#include "ShardDBTools.hpp" #include #include #include #include #include #include "Common.hpp" #include "Env.hpp" #include "LogsDB.hpp" #include "LogsDBTools.hpp" #include "Msgs.hpp" #include "RocksDBUtils.hpp" #include "ShardDB.hpp" #include "SharedRocksDB.hpp" #include "ShardDBData.hpp" namespace rocksdb { std::ostream& operator<<(std::ostream& out, const rocksdb::Slice& slice) { return goLangBytesFmt(out, (const char*)slice.data(), slice.size()); } } void ShardDBTools::verifyEqual(const std::string &db1Path, const std::string &db2Path) { Logger logger(LogLevel::LOG_INFO, STDERR_FILENO, false, false); std::shared_ptr xmon; Env env(logger, xmon, "ShardDBTools"); SharedRocksDB sharedDb1(logger, xmon, db1Path, ""); sharedDb1.registerCFDescriptors(ShardDB::getColumnFamilyDescriptors()); sharedDb1.registerCFDescriptors(LogsDB::getColumnFamilyDescriptors()); rocksdb::Options rocksDBOptions; rocksDBOptions.compression = rocksdb::kLZ4Compression; rocksDBOptions.bottommost_compression = rocksdb::kZSTD; sharedDb1.openForReadOnly(rocksDBOptions); SharedRocksDB sharedDb2(logger, xmon, db2Path, ""); sharedDb2.registerCFDescriptors(ShardDB::getColumnFamilyDescriptors()); sharedDb2.registerCFDescriptors(LogsDB::getColumnFamilyDescriptors()); sharedDb2.openForReadOnly(rocksDBOptions); auto db1 = sharedDb1.db(); auto db2 = sharedDb2.db(); size_t totalKeysCompared{0}; size_t totalKeySize{0}; size_t totalValueSize{0}; for(const auto& cf : ShardDB::getColumnFamilyDescriptors()) { LOG_INFO(env, "Starting comparison on CF %s", cf.name); auto cf1 = sharedDb1.getCF(cf.name); auto cf2 = sharedDb2.getCF(cf.name); size_t keysCompared{0}; size_t keySize{0}; size_t valueSize{0}; auto it1 = db1->NewIterator({}, cf1); auto it2 = db2->NewIterator({}, cf2); it1->SeekToFirst(); it2->SeekToFirst(); while (it1->Valid() && it2->Valid()) { if (it1->key() != it2->key()) { LOG_ERROR(env, "Found mismatch in key cf %s, key1: %s, key2: %s", cf.name, it1->key(), it2->key()); return; } if (it1->value() != it2->value()) { LOG_ERROR(env, "Found mismatch in value cf %s", cf.name); return; } ++keysCompared; keySize += it1->key().size(); valueSize += it1->value().size(); if (keysCompared % 100000000ull == 0) { LOG_INFO(env, "Compared %s key/value pairs so far and they are identical", keysCompared); } it1->Next(); it2->Next(); } if (it1->Valid()) { LOG_ERROR(env, "Database %s has extra keys in cf %s", db1Path, cf.name); return; } if (it2->Valid()) { LOG_ERROR(env, "Database %s has extra keys in cf %s", db2Path, cf.name); return; } delete it1; delete it2; LOG_INFO(env, "CF %s identical. Compared %s key/value pairs. KeySize: %s, ValueSize: %s", cf.name, keysCompared, keySize, valueSize); totalKeysCompared += keysCompared; totalKeySize += keySize; totalValueSize += valueSize; } LOG_INFO(env, "Databases identical! Compared %s cfs, %s key/value pairs. 
std::ostream& operator<<(std::ostream& out, const std::vector<LogIdx>& indices) {
    out << "[";
    for (auto idx : indices) {
        out << idx << ", ";
    }
    out << "]";
    return out;
}

void ShardDBTools::outputUnreleasedState(const std::string& dbPath) {
    Logger logger(LogLevel::LOG_INFO, STDERR_FILENO, false, false);
    std::shared_ptr<XmonAgent> xmon;
    Env env(logger, xmon, "ShardDBTools");

    SharedRocksDB sharedDb(logger, xmon, dbPath, "");
    sharedDb.registerCFDescriptors(ShardDB::getColumnFamilyDescriptors());
    sharedDb.registerCFDescriptors(LogsDB::getColumnFamilyDescriptors());
    rocksdb::Options rocksDBOptions;
    rocksDBOptions.compression = rocksdb::kLZ4Compression;
    rocksDBOptions.bottommost_compression = rocksdb::kZSTD;
    sharedDb.openForReadOnly(rocksDBOptions);

    LogIdx lastReleased;
    std::vector<LogIdx> unreleasedLogEntries;
    LogsDBTools::getUnreleasedLogEntries(env, sharedDb, lastReleased, unreleasedLogEntries);
    LOG_INFO(env, "Last released: %s", lastReleased);
    LOG_INFO(env, "Unreleased entries: %s", unreleasedLogEntries);
}

void ShardDBTools::fsck(const std::string& dbPath) {
    Logger logger(LogLevel::LOG_INFO, STDERR_FILENO, false, false);
    std::shared_ptr<XmonAgent> xmon;
    Env env(logger, xmon, "ShardDBTools");

    SharedRocksDB sharedDb(logger, xmon, dbPath, "");
    sharedDb.registerCFDescriptors(ShardDB::getColumnFamilyDescriptors());
    sharedDb.registerCFDescriptors(LogsDB::getColumnFamilyDescriptors());
    rocksdb::Options rocksDBOptions;
    rocksDBOptions.compression = rocksdb::kLZ4Compression;
    rocksDBOptions.bottommost_compression = rocksdb::kZSTD;
    sharedDb.openForReadOnly(rocksDBOptions);
    auto db = sharedDb.db();

    bool anyErrors = false;
#define ERROR(...) do { \
        LOG_ERROR(env, __VA_ARGS__); \
        anyErrors = true; \
    } while (0)

    const rocksdb::Snapshot* snapshotPtr = db->GetSnapshot();
    ALWAYS_ASSERT(snapshotPtr != nullptr);
    const auto snapshotDeleter = [db](const rocksdb::Snapshot* ptr) { db->ReleaseSnapshot(ptr); };
    std::unique_ptr<const rocksdb::Snapshot, decltype(snapshotDeleter)> snapshot(snapshotPtr, snapshotDeleter);

    auto directoriesCf = sharedDb.getCF("directories");
    auto edgesCf = sharedDb.getCF("edges");
    auto filesCf = sharedDb.getCF("files");
    auto spansCf = sharedDb.getCF("spans");
    auto transientFilesCf = sharedDb.getCF("transientFiles");
    auto blockServicesToFilesCf = sharedDb.getCF("blockServicesToFiles");

    rocksdb::ReadOptions options;
    options.snapshot = snapshot.get();

    LOG_INFO(env, "This will take a while and will use tens of gigabytes of RAM.");

    {
        LOG_INFO(env, "Directories pass");
        uint64_t analyzedDirs = 0;
        std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(options, directoriesCf));
        for (it->SeekToFirst(); it->Valid(); it->Next()) {
            analyzedDirs++;
            auto dirK = ExternalValue<InodeIdKey>::FromSlice(it->key());
            if (dirK().id().type() != InodeType::DIRECTORY) {
                ERROR("Found key with bad directory inode %s", dirK().id());
                continue;
            }
            auto _ = ExternalValue<DirectoryBody>::FromSlice(it->value()); // just to check that this works
        }
        ROCKS_DB_CHECKED(it->status());
        LOG_INFO(env, "Analyzed %s directories", analyzedDirs);
    }

    // No duplicate ownership of any kind in edges -- currently broken in shard 0
    // because of the outage which led to the fork.
    bool checkEdgeOrdering = dbPath.find("000") == std::string::npos;
    if (!checkEdgeOrdering) {
        LOG_INFO(env, "Will not check edge ordering since we think this is shard 0");
    }
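    // The edges pass checks, per directory:
    //   * every directory referenced by edges exists, with mtime >= all its edges' times;
    //   * directories with current edges are not snapshot (i.e. still have an owner),
    //     except for the root directory;
    //   * no inode is owned by more than one edge;
    //   * owned targets living in the same shard actually exist;
    //   * per name, edges are ordered by creation time, with special rules for deletion edges.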
    {
        LOG_INFO(env, "Edges pass");
        const rocksdb::ReadOptions options;
        std::unordered_set<InodeId> ownedInodes;
        auto dummyCurrentDirectory = InodeId::FromU64Unchecked(1ull << 63);
        auto thisDir = dummyCurrentDirectory;
        bool thisDirHasCurrentEdges = false;
        EggsTime thisDirMaxTime = 0;
        std::string thisDirMaxTimeEdge;
        // the last edge for a given name, in a given directory
        std::unordered_map<std::string, std::pair<StaticValue<EdgeKey>, StaticValue<SnapshotEdgeBody>>> thisDirLastEdges;
        uint64_t analyzedEdges = 0;
        uint64_t analyzedDirectories = 0;
        std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(options, edgesCf));
        for (it->SeekToFirst(); it->Valid(); it->Next()) {
            analyzedEdges++;
            auto edgeK = ExternalValue<EdgeKey>::FromSlice(it->key());
            if (edgeK().dirId().type() != InodeType::DIRECTORY) {
                ERROR("Found edge with bad inode id %s", edgeK().dirId());
                continue;
            }
            if (edgeK().dirId() != thisDir) {
                analyzedDirectories++;
                if (thisDir != dummyCurrentDirectory) {
                    auto dirIdK = InodeIdKey::Static(thisDir);
                    std::string dirValue;
                    auto status = db->Get(options, directoriesCf, dirIdK.toSlice(), &dirValue);
                    if (status.IsNotFound()) {
                        // the directory must exist
                        ERROR("Directory %s is referenced in edges, but it does not exist.", thisDir);
                    } else {
                        ROCKS_DB_CHECKED(status);
                        auto dir = ExternalValue<DirectoryBody>(dirValue);
                        // the mtime must be greater or equal than all edges
                        if (dir().mtime() < thisDirMaxTime) {
                            ERROR("Directory %s has time %s, but max edge with name %s has greater time %s", thisDir, dir().mtime(), thisDirMaxTimeEdge, thisDirMaxTime);
                        }
                        // if we have current edges, the directory can't be snapshot
                        if (thisDirHasCurrentEdges && thisDir != ROOT_DIR_INODE_ID && dir().ownerId() == NULL_INODE_ID) {
                            ERROR("Directory %s has current edges, but is snapshot (no owner)", thisDir);
                        }
                    }
                }
                thisDir = edgeK().dirId();
                thisDirLastEdges.clear();
                thisDirHasCurrentEdges = false;
                thisDirMaxTime = 0;
            }
            std::string name(edgeK().name().data(), edgeK().name().size());
            InodeId ownedTargetId = NULL_INODE_ID;
            EggsTime creationTime;
            std::optional<ExternalValue<CurrentEdgeBody>> currentEdge;
            std::optional<ExternalValue<SnapshotEdgeBody>> snapshotEdge;
            if (edgeK().current()) {
                currentEdge = ExternalValue<CurrentEdgeBody>::FromSlice(it->value());
                ownedTargetId = (*currentEdge)().targetId();
                creationTime = (*currentEdge)().creationTime();
                thisDirHasCurrentEdges = true;
            } else {
                snapshotEdge = ExternalValue<SnapshotEdgeBody>::FromSlice(it->value());
                creationTime = edgeK().creationTime();
                if ((*snapshotEdge)().targetIdWithOwned().extra()) {
                    ownedTargetId = (*snapshotEdge)().targetIdWithOwned().id();
                    // we can't have an owned directory snapshot edge
                    if (ownedTargetId.type() == InodeType::DIRECTORY) {
                        ERROR("Directory %s has a snapshot, owned edge to directory %s", thisDir, ownedTargetId);
                    }
                }
            }
            if (ownedTargetId != NULL_INODE_ID) {
                if (ownedInodes.contains(ownedTargetId)) {
                    ERROR("Inode %s is owned twice!", ownedTargetId);
                }
                ownedInodes.emplace(ownedTargetId);
            }
            if (creationTime > thisDirMaxTime) {
                thisDirMaxTime = creationTime;
                thisDirMaxTimeEdge = name;
            }
            // No dangling owned targets within the same shard
            if (ownedTargetId != NULL_INODE_ID && ownedTargetId.shard() == thisDir.shard()) {
                auto idK = InodeIdKey::Static(ownedTargetId);
                if (ownedTargetId.type() == InodeType::DIRECTORY) {
                    std::string tmp;
                    auto status = db->Get(options, directoriesCf, idK.toSlice(), &tmp);
                    if (status.IsNotFound()) {
                        ERROR("Directory %s is referenced in directory %s but I could not find it.", ownedTargetId, edgeK().dirId());
                    } else {
                        ROCKS_DB_CHECKED(status);
                    }
                } else {
                    std::string tmp;
                    auto status = db->Get(options, filesCf, idK.toSlice(), &tmp);
                    if (status.IsNotFound()) {
                        ERROR("File %s is referenced in directory %s but I could not find it.", ownedTargetId, edgeK().dirId());
                    } else {
                        ROCKS_DB_CHECKED(status);
                    }
                }
            }
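            // Compare this edge against the last edge seen with the same name in
            // this directory, enforcing the per-name ordering rules below.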
            {
                auto prevEdge = thisDirLastEdges.find(name);
                if (prevEdge != thisDirLastEdges.end()) {
                    EggsTime prevCreationTime = prevEdge->second.first().creationTime();
                    // The edge must be newer than every non-current edge before it, with the
                    // exception of deletion edges (when we override the deletion edge)
                    if (
                        (prevCreationTime > creationTime) ||
                        (prevCreationTime == creationTime && !(snapshotEdge && (*snapshotEdge)().targetIdWithOwned().id() == NULL_INODE_ID))
                    ) {
                        if (checkEdgeOrdering) {
                            ERROR("Name %s in directory %s has overlapping edges (%s >= %s).", name, edgeK().dirId(), prevCreationTime, creationTime);
                        }
                    }
                    // Deletion edges are not preceded by another deletion edge
                    if (snapshotEdge && (*snapshotEdge)().targetIdWithOwned().id() == NULL_INODE_ID && prevEdge->second.second().targetIdWithOwned().id() == NULL_INODE_ID) {
                        if (checkEdgeOrdering) {
                            ERROR("Deletion edge with name %s creation time %s is preceded by another deletion edge in directory %s", name, edgeK().creationTime(), thisDir);
                        }
                    }
                } else {
                    // Deletion edges must be preceded by something -- this is actually not enforced
                    // by the schema, but by GC. So it's not a huge deal if it's wrong.
                    if (snapshotEdge && (*snapshotEdge)().targetIdWithOwned().id() == NULL_INODE_ID) {
                        if (checkEdgeOrdering) {
                            ERROR("Deletion edge with name %s creation time %s is not preceded by anything in directory %s", name, edgeK().creationTime(), thisDir);
                        }
                    }
                }
            }
            // Add last edge (only snapshot edges can have predecessors)
            if (snapshotEdge) {
                StaticValue<EdgeKey> staticEdgeK(edgeK());
                StaticValue<SnapshotEdgeBody> staticEdgeV((*snapshotEdge)());
                thisDirLastEdges[name] = {staticEdgeK, staticEdgeV};
            }
        }
        ROCKS_DB_CHECKED(it->status());
        LOG_INFO(env, "Analyzed %s edges in %s directories", analyzedEdges, analyzedDirectories);
    }

    struct HashBlockServiceInodeId {
        std::size_t operator()(const std::pair<BlockServiceId, InodeId>& k) const {
            return std::hash<uint64_t>{}(k.first.u64) ^ std::hash<uint64_t>{}(k.second.u64);
        }
    };
    std::unordered_map<std::pair<BlockServiceId, InodeId>, int64_t, HashBlockServiceInodeId> blockServicesToFiles;
    std::unordered_map<InodeId, uint64_t> filesWithSpans; // file to expected size

    {
        LOG_INFO(env, "Spans pass");
        std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(options, spansCf));
        InodeId thisFile = NULL_INODE_ID;
        uint64_t expectedNextOffset = 0;
        uint64_t analyzedSpans = 0;
        uint64_t analyzedFiles = 0;
        for (it->SeekToFirst(); it->Valid(); it->Next()) {
            analyzedSpans++;
            auto spanK = ExternalValue<SpanKey>::FromSlice(it->key());
            if (spanK().fileId().type() != InodeType::FILE && spanK().fileId().type() != InodeType::SYMLINK) {
                ERROR("Found span with bad file id %s", spanK().fileId());
                continue;
            }
            if (spanK().fileId() != thisFile) {
                analyzedFiles++;
                expectedNextOffset = 0;
                thisFile = spanK().fileId();
            }
            // check that the file exists (either transient or not)
            {
                auto idK = InodeIdKey::Static(thisFile);
                std::string tmp;
                auto status = db->Get(options, filesCf, idK.toSlice(), &tmp);
                if (status.IsNotFound()) {
                    auto status = db->Get(options, transientFilesCf, idK.toSlice(), &tmp);
                    if (status.IsNotFound()) {
                        ERROR("We have a span for non-existent file %s", thisFile);
                    } else {
                        ROCKS_DB_CHECKED(status);
                    }
                } else {
                    ROCKS_DB_CHECKED(status);
                }
            }
            // Record the expected file size, and check that spans are contiguous
            auto spanV = ExternalValue<SpanBody>::FromSlice(it->value());
            filesWithSpans[spanK().fileId()] = spanK().offset() + spanV().spanSize();
            if (spanK().offset() != expectedNextOffset) {
                ERROR("Expected offset %s, got %s in span for file %s", expectedNextOffset, spanK().offset(), thisFile);
            }
            expectedNextOffset = spanK().offset() + spanV().spanSize();
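            // Spans must not use empty storage; for block storage, count blocks per
            // (block service, file) pair so that the block services pass below can
            // cross-check the blockServicesToFiles column family.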
            if (spanV().storageClass() == EMPTY_STORAGE) {
                ERROR("File %s has empty storage span at offset %s", thisFile, spanK().offset());
            } else if (spanV().storageClass() != INLINE_STORAGE) {
                const auto& spanBlock = spanV().blocksBody();
                for (int i = 0; i < spanBlock.parity().blocks(); i++) {
                    auto block = spanBlock.block(i);
                    blockServicesToFiles[std::pair(block.blockService(), thisFile)] += 1;
                }
            }
        }
        ROCKS_DB_CHECKED(it->status());
        LOG_INFO(env, "Analyzed %s spans in %s files", analyzedSpans, analyzedFiles);
    }

    const auto filesPass = [&env, &anyErrors, db, &options, &filesWithSpans]<typename Body>(rocksdb::ColumnFamilyHandle* cf, const std::string& what) {
        uint64_t analyzedFiles = 0;
        std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(options, cf));
        for (it->SeekToFirst(); it->Valid(); it->Next()) {
            analyzedFiles++;
            auto fileK = ExternalValue<InodeIdKey>::FromSlice(it->key());
            InodeId fileId = fileK().id();
            if (fileId.type() != InodeType::FILE && fileId.type() != InodeType::SYMLINK) {
                ERROR("Found %s with bad file id %s", what, fileId);
                continue;
            }
            auto fileV = ExternalValue<Body>::FromSlice(it->value());
            if (fileV().fileSize() == 0) { continue; } // nothing in this file
            auto spansSize = filesWithSpans.find(fileId);
            if (spansSize == filesWithSpans.end()) {
                ERROR("Could not find spans for non-zero-sized file %s", fileId);
            } else if (spansSize->second != fileV().fileSize()) {
                ERROR("Spans tell us file %s should be of size %s, but it actually has size %s", fileId, spansSize->second, fileV().fileSize());
            }
        }
        ROCKS_DB_CHECKED(it->status());
        LOG_INFO(env, "Analyzed %s %s", analyzedFiles, what);
    };
    LOG_INFO(env, "Files pass");
    filesPass.template operator()<FileBody>(filesCf, "files");
    LOG_INFO(env, "Transient files pass");
    filesPass.template operator()<TransientFileBody>(transientFilesCf, "transient files");

    LOG_INFO(env, "Block services to files pass");
    {
        uint64_t foundItems = 0;
        std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(options, blockServicesToFilesCf));
        for (it->SeekToFirst(); it->Valid(); it->Next()) {
            auto k = ExternalValue<BlockServiceToFileKey>::FromSlice(it->key());
            if (k().fileId().type() != InodeType::FILE && k().fileId().type() != InodeType::SYMLINK) {
                ERROR("Found block service to files with bad file id %s", k().fileId());
                continue;
            }
            auto v = ExternalValue<I64Value>::FromSlice(it->value());
            if (v().i64() != 0) {
                foundItems++;
            }
            auto expected = blockServicesToFiles.find(std::pair(k().blockServiceId(), k().fileId()));
            if (expected == blockServicesToFiles.end() && v().i64() != 0) {
                ERROR("Found spurious block services to file entry for block service %s, file %s, with count %s", k().blockServiceId(), k().fileId(), v().i64());
            }
            if (expected != blockServicesToFiles.end() && v().i64() != expected->second) {
                ERROR("Found wrong block services to file entry for block service %s, file %s, expected count %s, got %s", k().blockServiceId(), k().fileId(), expected->second, v().i64());
            }
        }
        ROCKS_DB_CHECKED(it->status());
        if (foundItems != blockServicesToFiles.size()) {
            ERROR("Expected %s block services to files, got %s", blockServicesToFiles.size(), foundItems);
        }
        LOG_INFO(env, "Analyzed %s block service to files entries", foundItems);
    }

    ALWAYS_ASSERT(!anyErrors, "Some errors detected, check log");
#undef ERROR
}