Initial commit

This commit is contained in:
dev
2025-02-27 21:53:53 +08:00
commit 815e55e4c0
1291 changed files with 185445 additions and 0 deletions

View File

@@ -0,0 +1,183 @@
#include "storage/service/BufferPool.h"
#include <folly/experimental/coro/BlockingWait.h>
#include <folly/experimental/coro/Collect.h>
#include <folly/experimental/coro/Invoke.h>
#include <folly/experimental/coro/Task.h>
#include <sys/uio.h>
#include "common/monitor/Recorder.h"
#include "common/net/ib/RDMABuf.h"
#include "common/utils/MagicEnum.hpp"
#include "fbs/storage/Common.h"
namespace hf3fs::storage {
namespace {
void alignBuffer(net::RDMABuf &rdmabuf) {
auto address = reinterpret_cast<uint64_t>(rdmabuf.ptr());
auto remain = address % kAIOAlignSize;
if (remain == 0) {
return;
}
auto crop = kAIOAlignSize - remain;
rdmabuf.advance(std::min(crop, rdmabuf.size()));
}
} // namespace
Result<Void> BufferPool::init(CPUExecutorGroup &executor) {
buffers_.clear();
buffers_.reserve(UIO_MAXIOV);
auto smallBufferResult =
initBuffers(executor, config_.rdmabuf_size(), config_.rdmabuf_count(), UIO_MAXIOV / 2, buffers_);
RETURN_AND_LOG_ON_ERROR(smallBufferResult);
*freeIndex_.lock() = std::move(*smallBufferResult);
bigBufferRegisterIndexStart_ = buffers_.size();
auto bigBufferResult =
initBuffers(executor, config_.big_rdmabuf_size(), config_.big_rdmabuf_count(), UIO_MAXIOV / 2, buffers_);
RETURN_AND_LOG_ON_ERROR(bigBufferResult);
*bigFreeIndex_.lock() = std::move(*bigBufferResult);
iovecs_.clear();
iovecs_.reserve(buffers_.size());
for (auto &buf : buffers_) {
iovecs_.push_back({(void *)buf.ptr(), buf.size()});
}
return Void{};
}
Result<std::vector<BufferIndex>> BufferPool::initBuffers(CPUExecutorGroup &executor,
Size rdmabufSize,
uint32_t rdmabufCount,
uint32_t limit,
std::vector<net::RDMABuf> &outBuffers) {
size_t totalSize = rdmabufSize * rdmabufCount;
size_t bufferCount = std::min(limit, rdmabufCount);
size_t smallBufferCount = (totalSize / bufferCount + rdmabufSize - 1) / rdmabufSize;
size_t bufferSize = smallBufferCount * rdmabufSize;
auto pool = net::RDMABufPool::create(bufferSize, bufferCount);
std::vector<folly::coro::TaskWithExecutor<net::RDMABuf>> tasks;
tasks.reserve(bufferCount);
for (auto i = 0u; i < bufferCount; ++i) {
tasks.push_back(pool->allocate().scheduleOn(&executor.pickNext()));
}
XLOGF(INFO, "allocate {} * {} RDMA buffers started", bufferCount, Size{bufferSize});
auto buffers = folly::coro::blockingWait(folly::coro::collectAllRange(std::move(tasks)));
XLOGF(INFO, "allocate {} * {} RDMA buffers finished", bufferCount, Size{bufferSize});
std::vector<BufferIndex> freeIndex;
freeIndex.reserve(rdmabufCount);
for (auto &buf : buffers) {
if (UNLIKELY(!buf)) {
auto msg = fmt::format("storage init buffer pool failed");
XLOG(ERR, msg);
return makeError(StorageCode::kStorageInitFailed, std::move(msg));
}
alignBuffer(buf);
BufferIndex bufferIndex;
bufferIndex.registerIndex = outBuffers.size();
outBuffers.push_back(buf);
auto split = buf;
for (; split.size() >= rdmabufSize; split.advance(rdmabufSize)) {
bufferIndex.buffer = split.first(rdmabufSize);
freeIndex.push_back(bufferIndex);
}
}
return Result<std::vector<BufferIndex>>(std::move(freeIndex));
}
BufferPool::Buffer::~Buffer() {
for (auto &index : indices_) {
pool_->deallocate(index);
}
}
Result<net::RDMABuf> BufferPool::Buffer::tryAllocate(uint32_t size) {
if (indices_.empty() || current_.size() < size) {
if (UNLIKELY(size > pool_->rdmabufSize_)) {
return makeError(StorageCode::kBufferSizeExceeded);
}
if (LIKELY(pool_->semaphore_.try_wait())) {
auto index = pool_->allocate();
indices_.push_back(index);
current_ = index.buffer;
} else {
return makeError(RPCCode::kRDMANoBuf);
}
}
auto ret = current_.takeFirst(size);
assert(ret);
alignBuffer(current_);
return ret;
}
CoTryTask<net::RDMABuf> BufferPool::Buffer::allocate(uint32_t size) {
if (indices_.empty() || current_.size() < size) {
if (UNLIKELY(size > pool_->bigRdmabufSize_)) {
co_return makeError(StorageCode::kBufferSizeExceeded);
} else if (UNLIKELY(size > pool_->rdmabufSize_)) {
co_await pool_->bigSemaphore_.co_wait();
auto index = pool_->allocateBig();
indices_.push_back(index);
current_ = index.buffer;
} else {
co_await pool_->semaphore_.co_wait();
auto index = pool_->allocate();
indices_.push_back(index);
current_ = index.buffer;
}
}
auto ret = current_.takeFirst(size);
assert(ret);
alignBuffer(current_);
co_return ret;
}
void BufferPool::clear(CPUExecutorGroup &executor) {
std::vector<folly::coro::TaskWithExecutor<void>> tasks;
tasks.reserve(buffers_.size());
for (auto &buffer : buffers_) {
tasks.push_back(folly::coro::co_invoke([&, buf = std::move(buffer)]() mutable -> CoTask<void> {
buf = {};
co_return;
}).scheduleOn(&executor.pickNext()));
}
XLOGF(INFO, "deallocate {} RDMA buffers started", buffers_.size());
folly::coro::blockingWait(folly::coro::collectAllRange(std::move(tasks)));
XLOGF(INFO, "deallocate {} RDMA buffers finished", buffers_.size());
}
BufferIndex BufferPool::allocate() {
auto guard = freeIndex_.lock();
assert(!guard->empty());
auto ret = guard->back();
guard->pop_back();
return ret;
}
BufferIndex BufferPool::allocateBig() {
auto guard = bigFreeIndex_.lock();
assert(!guard->empty());
auto ret = guard->back();
guard->pop_back();
return ret;
}
void BufferPool::deallocate(const BufferIndex &index) {
if (UNLIKELY(index.registerIndex >= bigBufferRegisterIndexStart_)) {
bigFreeIndex_.lock()->push_back(index);
bigSemaphore_.signal();
} else {
freeIndex_.lock()->push_back(index);
semaphore_.signal();
}
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,92 @@
#pragma once
#include <folly/Synchronized.h>
#include <folly/executors/CPUThreadPoolExecutor.h>
#include <folly/fibers/Semaphore.h>
#include <limits>
#include "common/net/ib/RDMABuf.h"
#include "common/utils/CPUExecutorGroup.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/ConstructLog.h"
#include "common/utils/Size.h"
namespace hf3fs::storage {
struct BufferIndex {
uint32_t registerIndex;
net::RDMABuf buffer;
};
class BufferPool {
public:
class Config : public ConfigBase<Config> {
CONFIG_ITEM(rdmabuf_size, 4_MB);
CONFIG_ITEM(rdmabuf_count, 1024u);
CONFIG_ITEM(big_rdmabuf_size, 64_MB);
CONFIG_ITEM(big_rdmabuf_count, 64u);
};
BufferPool(const Config &config)
: config_(config),
rdmabufSize_(config_.rdmabuf_size()),
semaphore_(config_.rdmabuf_count()),
bigRdmabufSize_(config_.big_rdmabuf_size()),
bigSemaphore_(config_.big_rdmabuf_count()) {}
Result<Void> init(CPUExecutorGroup &executor);
auto &iovecs() const { return iovecs_; }
class Buffer {
public:
explicit Buffer(BufferPool &pool)
: pool_(&pool) {}
Buffer(const Buffer &) = delete;
Buffer(Buffer &&other) = default;
Buffer &operator=(Buffer &&other) = default;
~Buffer();
Result<net::RDMABuf> tryAllocate(uint32_t size);
CoTryTask<net::RDMABuf> allocate(uint32_t size);
auto index() const { return indices_.back().registerIndex; }
private:
BufferPool *pool_{};
std::vector<BufferIndex> indices_;
net::RDMABuf current_;
};
auto get() { return Buffer{*this}; }
void clear(CPUExecutorGroup &executor);
protected:
static Result<std::vector<BufferIndex>> initBuffers(CPUExecutorGroup &executor,
Size rdmabufSize,
uint32_t rdmabufCount,
uint32_t limit,
std::vector<net::RDMABuf> &outBuffers);
BufferIndex allocate();
BufferIndex allocateBig();
void deallocate(const BufferIndex &index);
private:
ConstructLog<"storage::BufferPool"> constructLog_;
const Config &config_;
Size rdmabufSize_;
std::vector<net::RDMABuf> buffers_;
std::vector<struct iovec> iovecs_;
folly::fibers::Semaphore semaphore_;
folly::Synchronized<std::vector<BufferIndex>, std::mutex> freeIndex_;
Size bigRdmabufSize_;
uint32_t bigBufferRegisterIndexStart_ = 0;
folly::fibers::Semaphore bigSemaphore_;
folly::Synchronized<std::vector<BufferIndex>, std::mutex> bigFreeIndex_;
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,263 @@
#include "storage/service/Components.h"
#include <folly/experimental/coro/BlockingWait.h>
#include "common/app/ApplicationBase.h"
#include "common/monitor/Recorder.h"
#include "common/utils/LogCommands.h"
#include "stubs/common/RealStubFactory.h"
#include "stubs/mgmtd/MgmtdServiceStub.h"
namespace hf3fs::storage {
namespace {
constexpr std::string_view kRoutingInfoListenerName = "Components";
monitor::ValueRecorder targetStateRecorder{"storage.target_state", std::nullopt, false};
} // namespace
Components::Components(const Config &config)
: config(config),
rdmabufPool(config.buffer_pool()),
storageTargets(config.targets(), targetMap),
aioReadWorker(config.aio_read_worker()),
messenger(config.forward_client()),
resyncWorker(config.sync_worker(), *this),
checkWorker(config.check_worker(), *this),
dumpWorker(config.dump_worker(), *this),
allocateWorker(config.allocate_worker(), *this),
punchHoleWorker(*this),
syncMetaKvWorker(config.sync_meta_kv_worker(), *this),
reliableForwarding(config.reliable_forwarding(), *this),
readPool(config.coroutines_pool_read(), "ReadPool"),
updatePool(config.coroutines_pool_update(), "UpdatePool"),
syncPool(config.coroutines_pool_default(), "SyncPool"),
defaultPool(config.coroutines_pool_default(), "DefaultPool"),
storageOperator(config.storage(), *this),
reliableUpdate(config.reliable_update(), *this) {}
Result<Void> Components::start(const flat::AppInfo &appInfo, net::ThreadPoolGroup &tpg) {
this->appInfo = appInfo;
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start rdmabufPool", rdmabufPool.init(tpg.procThreadPool()));
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start readPool", readPool.start());
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start updatePool", updatePool.start());
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start syncPool", syncPool.start());
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start defaultPool", defaultPool.start());
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start messenger", messenger.start());
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start reliableForwarding", reliableForwarding.init());
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start storageTargets", storageTargets.load(tpg.procThreadPool()));
RETURN_ON_ERROR_LOG_WRAPPED(INFO,
"Start aioReadWorker",
aioReadWorker.start(storageTargets.fds(), rdmabufPool.iovecs()));
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start dumpWorker", dumpWorker.start(appInfo.nodeId));
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start allocateWorker", allocateWorker.start());
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start punchHoleWorker", punchHoleWorker.start());
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start syncMetaKvWorker", syncMetaKvWorker.start());
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start waitRoutingInfo", waitRoutingInfo(appInfo, tpg.bgThreadPool().randomPick()));
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start resyncWorker", resyncWorker.start());
RETURN_ON_ERROR_LOG_WRAPPED(INFO,
"Start checkWorker",
checkWorker.start(storageTargets.targetPaths(), storageTargets.manufacturers()));
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start storageOperator", storageOperator.init(storageTargets.targetPaths().size()));
return Void{};
}
Result<Void> Components::waitRoutingInfo(const flat::AppInfo &appInfo, folly::CPUThreadPoolExecutor &executor) {
// 1. init mgdtd client.
if (!netClient) {
netClient = std::make_unique<net::Client>(config.client());
RETURN_AND_LOG_ON_ERROR(netClient->start());
}
if (mgmtdClient.load() == nullptr) {
auto stubFactory = std::make_unique<hf3fs::stubs::RealStubFactory<mgmtd::MgmtdServiceStub>>(
hf3fs::stubs::ClientContextCreator{[&](net::Address addr) { return netClient->serdeCtx(addr); }});
mgmtdClient = std::make_shared<hf3fs::client::MgmtdClientForServer>(appInfo.clusterId,
std::move(stubFactory),
config.mgmtd());
}
mgmtdClient.load()->setAppInfoForHeartbeat(appInfo);
mgmtdClient.load()->setConfigListener(ApplicationBase::updateConfig);
// 2. wait target offline.
auto currentMap = targetMap.snapshot();
updateHeartbeatPayload(*currentMap, true);
folly::coro::blockingWait(mgmtdClient.load()->start(&executor));
for (auto sleep = 0;; ++sleep) {
if (sleep) {
XLOGF(WARNING, "Waiting for target offline in routing info...");
std::this_thread::sleep_for(1000_ms);
}
folly::coro::blockingWait(mgmtdClient.load()->heartbeat());
auto copy = currentMap->clone();
auto refreshResult = folly::coro::blockingWait(mgmtdClient.load()->refreshRoutingInfo(false));
if (UNLIKELY(!refreshResult)) {
XLOGF(ERR, "refresh routing info error {}", refreshResult.error());
continue;
}
auto result = copy->updateRouting(mgmtdClient.load()->getRoutingInfo(), false);
if (UNLIKELY(!result)) {
XLOGF(ERR, "get and parse routing info error {}", result.error());
continue;
}
bool needWaiting = false;
for (auto &[targetId, target] : copy->getTargets()) {
if (target.publicState == flat::PublicTargetState::SERVING ||
target.publicState == flat::PublicTargetState::SYNCING ||
target.publicState == flat::PublicTargetState::WAITING) {
XLOGF(WARNING, "waiting for chain {} target {}", targetId, serde::toJsonString(target));
needWaiting = true;
break;
}
}
if (!needWaiting) {
break;
}
}
// 3. set listener.
targetMap.setUpdateCallback([this](const TargetMap &map) { updateHeartbeatPayload(map); });
RETURN_AND_LOG_ON_ERROR(refreshRoutingInfo());
folly::coro::blockingWait(mgmtdClient.load()->heartbeat());
XLOGF(INFO, "Initial target map: {}", serde::toJsonString(targetMap.snapshot()->getTargets()));
bool succ = mgmtdClient.load()->addRoutingInfoListener(std::string{kRoutingInfoListenerName},
[this](auto) { refreshRoutingInfo(); });
if (UNLIKELY(!succ)) {
auto msg = fmt::format("node {} addRoutingInfoListener failed!", appInfo.nodeId);
XLOG(ERR, msg);
return makeError(StorageCode::kStorageInitFailed, std::move(msg));
}
return Void{};
}
Result<Void> Components::refreshRoutingInfo() { return targetMap.updateRouting(mgmtdClient.load()->getRoutingInfo()); }
Result<Void> Components::stopAndJoin(CPUExecutorGroup &executor) {
LOG_COMMAND(INFO, "Stop aioReadWorker", aioReadWorker.stopAndJoin());
LOG_COMMAND(INFO, "Stop syncMetaKvWorker", syncMetaKvWorker.stopAndJoin());
LOG_COMMAND(INFO, "Stop punchHoleWorker", punchHoleWorker.stopAndJoin());
LOG_COMMAND(INFO, "Stop allocateWorker", allocateWorker.stopAndJoin());
LOG_COMMAND(INFO, "Stop dumpWorker", dumpWorker.stopAndJoin());
LOG_COMMAND(INFO, "Stop checkWorker", checkWorker.stopAndJoin());
LOG_COMMAND(INFO, "Stop resyncWorker", resyncWorker.stopAndJoin());
LOG_COMMAND(INFO, "Stop storageOperator", storageOperator.stopAndJoin());
LOG_COMMAND(INFO, "Stop reliableForwarding", reliableForwarding.stopAndJoin());
LOG_COMMAND(INFO, "Stop messenger", messenger.stopAndJoin());
targetMap.setUpdateCallback([](auto) {});
XLOGF(INFO, "Send offline state");
if (auto mgmtd = mgmtdClient.load()) {
mgmtd->removeRoutingInfoListener(kRoutingInfoListenerName);
updateHeartbeatPayload(*targetMap.snapshot(), true);
folly::coro::blockingWait(mgmtd->heartbeat());
}
LOG_COMMAND(INFO, "Stop routingStore", stopMgmtdClient());
LOG_COMMAND(INFO, "Stop readPool", readPool.stopAndJoin());
LOG_COMMAND(INFO, "Stop updatePool", updatePool.stopAndJoin());
LOG_COMMAND(INFO, "Stop syncPool", syncPool.stopAndJoin());
LOG_COMMAND(INFO, "Stop defaultPool", defaultPool.stopAndJoin());
auto snapshot = targetMap.release();
std::vector<std::shared_ptr<StorageTarget>> targets;
for (auto &[targetId, target] : snapshot->getTargets()) {
if (target.storageTarget != nullptr) {
targets.push_back(target.storageTarget);
}
}
LOG_COMMAND(INFO, "Reset target map", snapshot.reset());
XLOGF(WARNING, "start to release {} targets", targets.size());
std::atomic<uint32_t> released{};
std::atomic<uint32_t> synced{};
for (auto &target : targets) {
executor.randomPick().add([&, t = std::move(target)]() mutable {
auto result = t->release();
if (UNLIKELY(!result)) {
XLOGF(CRITICAL, "storage target sync meta failed {}, error: {}", t->path(), result.error());
} else {
++synced;
}
t = nullptr;
++released;
});
}
for (int i = 0; released != targets.size(); ++i) {
XLOGF_IF(INFO, i % 5 == 0, "Waiting for release targets finished...");
std::this_thread::sleep_for(100_ms);
}
XLOGF(WARNING, "released {} targets, synced {} targets", released.load(), synced.load());
LOG_COMMAND(INFO, "Clear storageTargets", storageTargets.globalFileStore().clear(executor));
LOG_COMMAND(INFO, "Clear rdmabufPool", rdmabufPool.clear(executor));
if (config.speed_up_quit()) {
for (auto &engine : storageTargets.engines()) {
engine->speed_up_quit();
}
}
return Void{};
}
Result<Void> Components::stopMgmtdClient() {
if (mgmtdClient.load()) {
folly::coro::blockingWait(mgmtdClient.load()->stop());
}
mgmtdClient.store(nullptr);
if (netClient) {
netClient->stopAndJoin();
}
netClient.reset();
return Void{};
}
Result<robin_hood::unordered_set<std::string>> Components::getActiveClientsList() {
auto result = folly::coro::blockingWait(mgmtdClient.load()->listClientSessions());
RETURN_AND_LOG_ON_ERROR(result);
if (result->bootstrapping) {
auto msg = fmt::format("mgmtd is bootstrapping, skip");
XLOG(WARNING, msg);
return makeError(StorageClientCode::kRoutingError, std::move(msg));
}
robin_hood::unordered_set<std::string> activeClients;
for (auto &client : result->sessions) {
activeClients.emplace(std::move(client.clientId));
}
return Result<robin_hood::unordered_set<std::string>>(std::move(activeClients));
}
void Components::triggerHeartbeatIfNeed() {
if (triggerHeartbeatFlag.exchange(0)) {
mgmtdClient.load()->triggerHeartbeat();
}
}
void Components::updateHeartbeatPayload(const TargetMap &targetMap, bool offline /* = false */) {
flat::StorageHeartbeatInfo heartbeat;
for (auto &[targetId, target] : targetMap.getTargets()) {
flat::LocalTargetInfo targetInfo;
targetInfo.targetId = targetId;
targetInfo.localState = offline ? flat::LocalTargetState::OFFLINE : target.localState;
targetInfo.diskIndex = target.diskIndex;
targetInfo.lowSpace = target.lowSpace;
monitor::TagSet tag;
tag.addTag("instance", fmt::format("{}", targetId));
targetStateRecorder.set(uint32_t(target.localState), tag);
if (targetInfo.localState != flat::LocalTargetState::OFFLINE) {
targetInfo.usedSize = target.storageTarget->usedSize();
targetInfo.chainVersion = target.vChainId.chainVer;
}
heartbeat.targets.push_back(targetInfo);
}
mgmtdClient.load()->updateHeartbeatPayload(heartbeat);
++triggerHeartbeatFlag;
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,124 @@
#pragma once
#include <folly/concurrency/AtomicSharedPtr.h>
#include "client/mgmtd/MgmtdClientForServer.h"
#include "client/storage/StorageMessenger.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/DynamicCoroutinesPool.h"
#include "common/utils/LockManager.h"
#include "common/utils/RobinHood.h"
#include "fbs/storage/Service.h"
#include "storage/aio/AioReadWorker.h"
#include "storage/service/BufferPool.h"
#include "storage/service/StorageOperator.h"
#include "storage/service/TargetMap.h"
#include "storage/store/StorageTargets.h"
#include "storage/sync/ResyncWorker.h"
#include "storage/worker/AllocateWorker.h"
#include "storage/worker/CheckWorker.h"
#include "storage/worker/DumpWorker.h"
#include "storage/worker/PunchHoleWorker.h"
#include "storage/worker/SyncMetaKvWorker.h"
namespace hf3fs::storage {
class ReliableForwarding;
struct Components {
struct Config : public ConfigBase<Config> {
CONFIG_OBJ(base, net::Server::Config, [](net::Server::Config &c) {
c.set_groups_length(2);
c.groups(0).listener().set_listen_port(8000);
c.groups(0).set_network_type(net::Address::RDMA);
c.groups(0).set_services({"StorageSerde"});
c.groups(1).set_network_type(net::Address::TCP);
c.groups(1).listener().set_listen_port(9000);
c.groups(1).set_use_independent_thread_pool(true);
c.groups(1).set_services({"Core"});
c.thread_pool().set_num_io_threads(32);
c.thread_pool().set_num_proc_threads(32);
});
CONFIG_OBJ(client, net::Client::Config);
CONFIG_OBJ(mgmtd, hf3fs::client::MgmtdClientForServer::Config);
CONFIG_OBJ(targets, StorageTargets::Config);
CONFIG_OBJ(storage, StorageOperator::Config);
CONFIG_OBJ(reliable_forwarding, ReliableForwarding::Config);
CONFIG_OBJ(reliable_update, ReliableUpdate::Config);
CONFIG_OBJ(buffer_pool, BufferPool::Config);
CONFIG_OBJ(aio_read_worker, AioReadWorker::Config);
CONFIG_OBJ(sync_worker, ResyncWorker::Config);
CONFIG_OBJ(check_worker, CheckWorker::Config);
CONFIG_OBJ(dump_worker, DumpWorker::Config);
CONFIG_OBJ(allocate_worker, AllocateWorker::Config);
CONFIG_OBJ(sync_meta_kv_worker, SyncMetaKvWorker::Config);
CONFIG_OBJ(forward_client, net::Client::Config);
CONFIG_OBJ(coroutines_pool_read, DynamicCoroutinesPool::Config);
CONFIG_OBJ(coroutines_pool_update, DynamicCoroutinesPool::Config);
CONFIG_OBJ(coroutines_pool_sync, DynamicCoroutinesPool::Config);
CONFIG_OBJ(coroutines_pool_default, DynamicCoroutinesPool::Config);
CONFIG_HOT_UPDATED_ITEM(use_coroutines_pool_read, true);
CONFIG_HOT_UPDATED_ITEM(use_coroutines_pool_update, true);
CONFIG_HOT_UPDATED_ITEM(speed_up_quit, true);
};
Components(const Config &config);
Result<Void> start(const flat::AppInfo &appInfo, net::ThreadPoolGroup &tpg);
Result<Void> waitRoutingInfo(const flat::AppInfo &appInfo, folly::CPUThreadPoolExecutor &executor);
Result<Void> refreshRoutingInfo();
Result<Void> stopAndJoin(CPUExecutorGroup &executor);
Result<Void> stopMgmtdClient();
const flat::AppInfo &getAppInfo() const { return appInfo; }
Result<robin_hood::unordered_set<std::string>> getActiveClientsList();
void triggerHeartbeatIfNeed();
inline DynamicCoroutinesPool &getCoroutinesPool(uint16_t methodId) {
if (LIKELY(config.use_coroutines_pool_read()) && methodId == StorageSerde<>::batchReadMethodId) {
return readPool;
}
if (LIKELY(config.use_coroutines_pool_update()) &&
(methodId == StorageSerde<>::writeMethodId || methodId == StorageSerde<>::updateMethodId)) {
return updatePool;
}
if (methodId == StorageSerde<>::syncStartMethodId || methodId == StorageSerde<>::getAllChunkMetadataMethodId) {
return syncPool;
}
return defaultPool;
}
protected:
void updateHeartbeatPayload(const TargetMap &map, bool offline = false);
public:
ConstructLog<"storage::Components"> constructLog_;
const Config &config;
flat::AppInfo appInfo;
std::unique_ptr<net::Client> netClient;
folly::atomic_shared_ptr<hf3fs::client::IMgmtdClientForServer> mgmtdClient;
BufferPool rdmabufPool;
AtomicallyTargetMap targetMap;
StorageTargets storageTargets;
AioReadWorker aioReadWorker;
client::StorageMessenger messenger;
ResyncWorker resyncWorker;
CheckWorker checkWorker;
DumpWorker dumpWorker;
AllocateWorker allocateWorker;
PunchHoleWorker punchHoleWorker;
SyncMetaKvWorker syncMetaKvWorker;
ReliableForwarding reliableForwarding;
DynamicCoroutinesPool readPool;
DynamicCoroutinesPool updatePool;
DynamicCoroutinesPool syncPool;
DynamicCoroutinesPool defaultPool;
StorageOperator storageOperator;
ReliableUpdate reliableUpdate;
std::atomic<uint32_t> triggerHeartbeatFlag{};
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,281 @@
#include "storage/service/ReliableForwarding.h"
#include <folly/experimental/coro/Sleep.h>
#include "common/app/ApplicationBase.h"
#include "common/monitor/Recorder.h"
#include "common/utils/Duration.h"
#include "common/utils/ExponentialBackoffRetry.h"
#include "fbs/storage/Common.h"
#include "storage/service/Components.h"
#include "storage/service/TargetMap.h"
namespace hf3fs::storage {
namespace {
monitor::OperationRecorder reliableForwardRecorder("storage.reliable_forward");
monitor::OperationRecorder syncingReadRecorder("storage.syncing_read");
monitor::OperationRecorder updateRemoteRecorder("storage.update_remote");
monitor::CountRecorder forwardWriteBytes("storage.forward.write_bytes");
monitor::DistributionRecorder forwardWriteDist("storage.forward.write_dist");
monitor::CountRecorder forwardSyncingBytes("storage.forward.syncing_bytes");
monitor::DistributionRecorder forwardSyncingDist("storage.forward.syncing_dist");
} // namespace
using namespace std::chrono_literals;
Result<Void> ReliableForwarding::init() { return Void{}; }
Result<Void> ReliableForwarding::stopAndJoin() { return Void{}; }
CoTask<IOResult> ReliableForwarding::forwardWithRetry(ServiceRequestContext &requestCtx,
const UpdateReq &req,
const net::RDMARemoteBuf &rdmabuf,
const ChunkEngineUpdateJob &chunkEngineJob,
TargetPtr &target,
CommitIO &commitIO,
bool allowOutdatedChainVer /* = true */) {
auto startTime = RelativeTime::now();
auto recordGuard = reliableForwardRecorder.record();
IOResult ioResult;
ExponentialBackoffRetry retry(config_.retry_first_wait().asMs(),
config_.retry_max_wait().asMs(),
config_.retry_total_time().asMs());
for (uint32_t retryCount = 0; !stopped_; ++retryCount) {
auto waitTime = retry.getWaitTime();
auto targetResult = components_.targetMap.getByChainId(req.payload.key.vChainId, allowOutdatedChainVer);
CO_RETURN_ON_ERROR(targetResult);
target = std::move(*targetResult);
auto ioResult = co_await forward(req, retryCount, rdmabuf, chunkEngineJob, target, commitIO, waitTime);
if (LIKELY(bool(ioResult.lengthInfo))) {
recordGuard.succ();
co_return ioResult;
} else if (ioResult.lengthInfo.error().code() == StorageCode::kNoSuccessorTarget) {
recordGuard.succ();
co_return ioResult;
}
// TODO(SF): fine-grained error handling.
auto code = ioResult.lengthInfo.error().code();
if (!allowOutdatedChainVer && code == StorageClientCode::kRoutingVersionMismatch) {
XLOGF(ERR,
"forwarding routing version mismatch, req {}, result {}, elapsed {}",
req,
ioResult,
(RelativeTime::now() - startTime).asMs());
co_return ioResult;
}
if (waitTime.count() == 0) {
XLOGF_IF(DFATAL,
!requestCtx.debugFlags.faultInjectionEnabled(),
"forwarding timeout with error, req {}, result {}",
req,
ioResult);
co_return ioResult;
} else if (code != RPCCode::kTimeout) {
XLOGF(WARNING,
"forwarding wait and retry, req {}, error {}, elapsed {}",
req,
ioResult,
(RelativeTime::now() - startTime).asMs());
constexpr auto checkInterval = 100ms;
for (auto elapsed = 0ms; elapsed < waitTime && !stopped_; elapsed += checkInterval) {
auto targetResult = components_.targetMap.getByChainId(req.payload.key.vChainId, allowOutdatedChainVer);
CO_RETURN_ON_ERROR(targetResult);
target = std::move(*targetResult);
if (!target->successor.has_value()) {
break;
}
co_await folly::coro::sleep(std::min(checkInterval, waitTime - elapsed));
}
}
}
auto msg = fmt::format("req is refused because of stopping, req {}", req);
XLOG(ERR, msg);
co_return makeError(RPCCode::kRequestRefused, std::move(msg));
}
CoTask<IOResult> ReliableForwarding::forward(const UpdateReq &req,
uint32_t retryCount,
const net::RDMARemoteBuf &rdmabuf,
const ChunkEngineUpdateJob &chunkEngineJob,
TargetPtr &target,
CommitIO &commitIO,
std::chrono::milliseconds timeout) {
if (!target->successor.has_value()) {
// use the latest chain version.
commitIO.commitChainVer = target->vChainId.chainVer;
co_return makeError(StorageCode::kNoSuccessorTarget);
}
auto ioResult = co_await doForward(req, rdmabuf, chunkEngineJob, retryCount, *target, commitIO.isSyncing, timeout);
if (ioResult.lengthInfo) {
commitIO.commitVer = ioResult.commitVer;
// use successor's chain version.
commitIO.commitChainVer = ioResult.commitChainVer;
if (ioResult.commitChainVer > target->vChainId.chainVer) {
// the remote obtains a higher chain version, and the local need to obtain the latest version by retry.
auto msg = fmt::format("the remote obtains a higher chain version {} > current {}, req {}",
ioResult.commitChainVer,
target->vChainId.chainVer,
req);
XLOGF(WARNING, "{}", msg);
co_return makeError(StorageCode::kChainVersionMismatch, std::move(msg));
}
}
co_return ioResult;
}
CoTask<IOResult> ReliableForwarding::doForward(const UpdateReq &req,
const net::RDMARemoteBuf &rdmabuf,
const ChunkEngineUpdateJob &chunkEngineJob,
uint32_t retryCount,
const Target &target,
bool &isSyncing,
std::chrono::milliseconds timeout) {
UpdateReq updateReq = req;
updateReq.options.fromClient = false;
updateReq.retryCount = retryCount;
updateReq.payload.rdmabuf = rdmabuf;
updateReq.payload.key.vChainId.chainVer = target.vChainId.chainVer;
auto buffer = components_.rdmabufPool.get();
isSyncing = target.successor->targetInfo.publicState == hf3fs::flat::PublicTargetState::SYNCING;
if (isSyncing) {
updateReq.options.isSyncing = true;
updateReq.options.commitChainVer = target.vChainId.chainVer;
}
bool readForSyncing = req.payload.isWriteTruncateExtend() && isSyncing &&
(req.options.isSyncing || req.payload.length != req.payload.chunkSize);
if (readForSyncing) {
auto recordGuard = syncingReadRecorder.record();
// read the entire chunk.
IOResult readResult;
auto allocateResult = buffer.tryAllocate(req.payload.chunkSize);
if (UNLIKELY(!allocateResult)) {
allocateResult = co_await buffer.allocate(req.payload.chunkSize);
}
if (UNLIKELY(!allocateResult)) {
readResult.lengthInfo = makeError(std::move(allocateResult.error()));
co_return readResult;
}
auto &readBuf = *allocateResult;
ReadIO payload;
payload.key = updateReq.payload.key;
payload.offset = 0;
payload.length = req.payload.chunkSize;
BatchReadJob batch(payload, target.storageTarget.get(), readResult, req.payload.checksum.type);
batch.setRecalculateChecksum();
batch.front().state().localbuf = readBuf;
batch.front().state().bufferIndex = buffer.index();
batch.front().state().readUncommitted = true;
if (chunkEngineJob.chunk()) {
batch.front().state().chunkEngineJob.set(nullptr, chunkEngineJob.chunk()->raw_chunk());
}
co_await components_.aioReadWorker.enqueue(&batch);
co_await batch.complete();
CO_RETURN_ON_ERROR(readResult.lengthInfo); // OK.
// clear the inline data if the update is built from full chunk read
if (BITFLAGS_CONTAIN(updateReq.featureFlags, FeatureFlags::SEND_DATA_INLINE)) {
BITFLAGS_CLEAR(updateReq.featureFlags, FeatureFlags::SEND_DATA_INLINE);
updateReq.payload.inlinebuf.data.clear();
}
auto length = *readResult.lengthInfo;
updateReq.payload.updateVer = readResult.updateVer;
if (req.options.isSyncing) {
updateReq.options.commitChainVer = batch.front().result().commitChainVer;
}
updateReq.payload.offset = 0;
updateReq.payload.length = length;
updateReq.payload.rdmabuf = readBuf.first(length).toRemoteBuf();
updateReq.payload.checksum = batch.front().state().chunkChecksum;
updateReq.payload.updateType = UpdateType::WRITE;
if (length <= config_.max_inline_forward_bytes()) {
updateReq.payload.inlinebuf.data.assign(readBuf.ptr(), readBuf.ptr() + length);
BITFLAGS_SET(updateReq.featureFlags, hf3fs::storage::FeatureFlags::SEND_DATA_INLINE);
}
recordGuard.succ();
} else if (isSyncing && !req.payload.isRemove() && chunkEngineJob.chunk() == nullptr) {
auto chunkResult = target.storageTarget->queryChunk(req.payload.key.chunkId);
if (UNLIKELY(!chunkResult)) {
XLOGF(ERR, "forward query chunk failed, req {}, error {}", updateReq, chunkResult.error());
co_return makeError(std::move(chunkResult.error()));
}
updateReq.payload.updateVer = chunkResult->updateVer;
}
auto recordGuard = updateRemoteRecorder.record();
auto addrResult = target.getSuccessorAddr();
if (UNLIKELY(!addrResult)) {
XLOGF(ERR, "target forward addr invalid, target {}", target);
co_return makeError(std::move(addrResult.error()));
}
net::UserRequestOptions reqOptions;
reqOptions.timeout = Duration{timeout};
auto updateResult = co_await components_.messenger.update(*addrResult, updateReq, &reqOptions);
if (UNLIKELY(!updateResult)) {
XLOGF(ERR, "forward timeout, req {}, result {}", updateReq, updateResult);
co_return makeError(std::move(updateResult.error()));
}
if (LIKELY(bool(updateResult->result.lengthInfo))) {
if (target.vChainId.chainVer < updateResult->result.commitChainVer) {
auto msg = fmt::format("chain version local < remote, req {} local {} remote {}",
updateReq,
target,
updateResult->result);
XLOG(ERR, msg);
co_return makeError(StorageCode::kChainVersionMismatch, std::move(msg));
}
auto length = *updateResult->result.lengthInfo;
monitor::TagSet tag;
tag.addTag("instance", fmt::format("{}", target.targetId));
if (isSyncing) {
updateResult->result.updateVer = req.payload.updateVer;
forwardSyncingBytes.addSample(length, tag);
forwardSyncingDist.addSample(length, tag);
} else {
forwardWriteBytes.addSample(length, tag);
forwardWriteDist.addSample(length, tag);
}
recordGuard.succ();
} else {
XLOGF(ERR, "forward failed, req {}, result {}", updateReq, updateResult->result);
auto errorCode = updateResult->result.lengthInfo.error().code();
if (errorCode == StorageCode::kChecksumMismatch) {
auto reqChecksum = updateReq.payload.checksum;
auto realChecksum = ChecksumInfo::create(reqChecksum.type,
(const uint8_t *)updateReq.payload.rdmabuf.addr(),
updateReq.payload.length);
if (reqChecksum != realChecksum) {
XLOGF(DFATAL,
"local rdma buffer is corrupted local {} != client {}, req: {}, kill self...",
realChecksum,
reqChecksum,
req);
ApplicationBase::handleSignal(SIGUSR2);
}
}
}
co_return updateResult->result;
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,65 @@
#pragma once
#include "client/storage/StorageMessenger.h"
#include "common/net/Client.h"
#include "common/net/ib/RDMABuf.h"
#include "common/utils/ConfigBase.h"
#include "fbs/storage/Common.h"
#include "storage/update/UpdateJob.h"
namespace hf3fs::storage {
struct Components;
struct Target;
class ReliableForwarding {
public:
struct Config : ConfigBase<Config> {
CONFIG_HOT_UPDATED_ITEM(retry_first_wait, 100_ms);
CONFIG_HOT_UPDATED_ITEM(retry_max_wait, 1000_ms);
CONFIG_HOT_UPDATED_ITEM(retry_total_time, 60_s);
CONFIG_HOT_UPDATED_ITEM(max_inline_forward_bytes, Size{});
};
ReliableForwarding(const Config &config, Components &components)
: config_(config),
components_(components) {}
Result<Void> init();
void beforeStop() { stopped_ = true; }
Result<Void> stopAndJoin();
CoTask<IOResult> forwardWithRetry(ServiceRequestContext &requestCtx,
const UpdateReq &req,
const net::RDMARemoteBuf &rdmabuf,
const ChunkEngineUpdateJob &chunkEngineJob,
TargetPtr &target,
CommitIO &commitIO,
bool allowOutdatedChainVer = true);
CoTask<IOResult> forward(const UpdateReq &req,
uint32_t retryCount,
const net::RDMARemoteBuf &rdmabuf,
const ChunkEngineUpdateJob &chunkEngineJob,
TargetPtr &target,
CommitIO &commitIO,
std::chrono::milliseconds timeout);
CoTask<IOResult> doForward(const UpdateReq &req,
const net::RDMARemoteBuf &rdmabuf,
const ChunkEngineUpdateJob &chunkEngineJob,
uint32_t retryCount,
const Target &target,
bool &isSyncing,
std::chrono::milliseconds timeout);
private:
ConstructLog<"storage::ReliableForwarding"> constructLog_;
const Config &config_;
Components &components_;
std::atomic<bool> stopped_ = false;
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,158 @@
#include "storage/service/ReliableUpdate.h"
#include "common/monitor/Recorder.h"
#include "common/utils/Duration.h"
#include "storage/service/Components.h"
#include "storage/service/StorageOperator.h"
namespace hf3fs::storage {
monitor::OperationRecorder reliableUpdateRecorder{"storage.reliable_update"};
monitor::CountRecorder reliableUpdateWaited{"storage.reliable_update.waited"};
monitor::CountRecorder reliableUpdateDuplidate{"storage.reliable_update.duplicate"};
monitor::CountRecorder reliableUpdateCached{"storage.reliable_update.cached"};
monitor::OperationRecorder waitChannelLockRecorder{"storage.wait_channel_lock"};
CoTask<IOResult> ReliableUpdate::update(ServiceRequestContext &requestCtx,
UpdateReq &req,
net::IBSocket *ibSocket,
TargetPtr &target) {
XLOGF(DBG1, "Start reliable update, tag: {}, req: {}", req.tag, req);
if (UNLIKELY(stopped_)) {
auto msg = fmt::format("req is refused because of stopping, req {}", req);
XLOG(ERR, msg);
co_return makeError(RPCCode::kRequestRefused, std::move(msg));
}
// 1. check if channel id is valid.
if (req.tag.channel.id == ChannelId{0}) {
XLOGF(DFATAL,
"{} request has invalid message tag {}: {}",
magic_enum::enum_name(req.payload.updateType),
req.tag,
req);
co_return makeError(StorageClientCode::kFoundBug);
}
// 2. get cached.
auto clientId = req.tag.clientId;
auto reqResult = shards_.withLock(
[&](ClientMap &map) {
auto &clientStatus = map[clientId];
if (clientStatus == nullptr) {
clientStatus = std::make_shared<ClientStatus>();
}
auto key = std::pair<ChainId, ChannelId>(req.payload.key.vChainId.chainId, req.tag.channel.id);
auto &reqResult = clientStatus->channelMap[key];
clientStatus->lastUsedTime = UtcClock::now();
return std::shared_ptr<ReqResult>(clientStatus, &reqResult);
},
clientId);
// 3. lock channel.
auto lockRecordGuard = waitChannelLockRecorder.record();
folly::coro::Baton baton;
auto lock = target->storageTarget->tryLockChannel(baton, fmt::format("{}:{}", clientId, req.tag.channel.id));
if (!lock.locked()) {
reliableUpdateWaited.addSample(1);
XLOGF(ERR, "Channel is locked, need retry, tag: {}, req: {}", req.tag, req);
co_return makeError(StorageCode::kChannelIsLocked);
}
lockRecordGuard.report(true);
IOResult updateResult;
if (req.tag.channel.seqnum < reqResult->channelSeqnum) {
reliableUpdateDuplidate.addSample(1);
XLOGF(WARN, "Find a duplicate update, tag: {}, cached result: {}, req: {}", req.tag, *reqResult, req);
co_return makeError(StorageClientCode::kDuplicateUpdate);
}
// 4. return cached result.
if (req.tag.channel.seqnum == reqResult->channelSeqnum &&
target->storageTarget->generationId() == reqResult->generationId) {
if (req.tag.requestId != reqResult->requestId) {
XLOGF(DFATAL,
"[BUG] Message tag {} is already assigned to another update, cached result: {}, req: {}",
req.tag,
*reqResult,
req);
co_return makeError(StorageClientCode::kFoundBug);
}
if (reqResult->updateResult.lengthInfo.hasValue()) {
if (req.payload.updateVer == 0 || req.payload.updateVer == reqResult->updateResult.updateVer) {
updateResult = reqResult->updateResult;
if (*updateResult.lengthInfo != req.payload.length && !req.payload.isExtend()) {
updateResult.lengthInfo = req.payload.length;
XLOGF(WARN,
"Cached length info {} not equal to write size in request {}, fixed update result: {}",
reqResult->updateResult.lengthInfo,
req,
updateResult);
}
reliableUpdateCached.addSample(1);
XLOGF(DBG1, "Return cached update result, tag: {}, cached result: {}, req: {}", req.tag, *reqResult, req);
co_return updateResult;
} else {
XLOGF(CRITICAL,
"Cached update version not equal to request update version, req:{}, cached result: {}",
req,
*reqResult);
}
} else if (req.payload.updateVer == 0 && !target->storageTarget->useChunkEngine() &&
reqResult->succUpdateVer != 0) {
XLOGF(CRITICAL, "Pick up previous update version, tag: {}, cached result: {}, req: {}", req.tag, *reqResult, req);
req.payload.updateVer = reqResult->succUpdateVer;
}
}
// 5. start a new task.
auto recordGuard = reliableUpdateRecorder.record();
updateResult = co_await components_.storageOperator.handleUpdate(requestCtx, req, ibSocket, target);
if (LIKELY(bool(updateResult.lengthInfo))) {
recordGuard.succ();
}
*reqResult = {req.tag.channel.seqnum,
req.tag.requestId,
updateResult,
req.payload.updateVer,
target->storageTarget->generationId()};
XLOGF(DBG1, "Completed reliable update, tag: {}, result: {}", req.tag, *reqResult);
co_return updateResult;
}
Result<Void> ReliableUpdate::cleanUpExpiredClients(const robin_hood::unordered_set<std::string> &activeClients) {
if (!config_.clean_up_expired_clients()) {
return Void{};
}
if (activeClients.empty()) {
XLOGF(ERR, "activeClients is empty!");
return Void{};
}
auto allZero = ClientId::zero();
std::size_t cleanUpClientCount = 0;
shards_.iterate([&](ClientMap &map) {
auto now = UtcClock::now();
auto expiredClientsTimeout = config_.expired_clients_timeout();
for (auto it = map.begin(); it != map.end();) {
const auto &[clientId, clientStatus] = *it;
if (!activeClients.contains(clientId.uuid.toHexString()) && clientId != allZero &&
now >= clientStatus->lastUsedTime + expiredClientsTimeout) {
XLOGF(WARNING, "clean up expired client {}, last used time: {}", clientId, clientStatus->lastUsedTime);
it = map.erase(it);
++cleanUpClientCount;
} else {
++it;
}
}
});
XLOGF(WARNING, "clean up {} expired clients", cleanUpClientCount);
return Void{};
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,61 @@
#pragma once
#include "common/net/Transport.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Duration.h"
#include "common/utils/LockManager.h"
#include "common/utils/RobinHood.h"
#include "common/utils/Shards.h"
#include "common/utils/Size.h"
#include "fbs/storage/Common.h"
#include "storage/service/TargetMap.h"
namespace hf3fs::storage {
struct Components;
class StorageOperator;
class ReliableUpdate {
public:
struct Config : ConfigBase<Config> {
CONFIG_HOT_UPDATED_ITEM(clean_up_expired_clients, false);
CONFIG_HOT_UPDATED_ITEM(expired_clients_timeout, 1_h);
};
ReliableUpdate(const Config &config, Components &components)
: config_(config),
components_(components) {}
CoTask<IOResult> update(ServiceRequestContext &requestCtx,
UpdateReq &req,
net::IBSocket *ibSocket,
TargetPtr &target);
Result<Void> cleanUpExpiredClients(const robin_hood::unordered_set<std::string> &activeClients);
void beforeStop() { stopped_ = true; }
private:
ConstructLog<"storage::ReliableUpdate"> constructLog_;
const Config &config_;
Components &components_;
std::atomic<bool> stopped_ = false;
folly::coro::Mutex mutex_;
struct ReqResult {
SERDE_STRUCT_FIELD(channelSeqnum, ChannelSeqNum{0});
SERDE_STRUCT_FIELD(requestId, RequestId{0});
SERDE_STRUCT_FIELD(updateResult, IOResult{});
SERDE_STRUCT_FIELD(succUpdateVer, ChunkVer{});
SERDE_STRUCT_FIELD(generationId, uint32_t{});
};
struct ClientStatus {
std::unordered_map<std::pair<ChainId, ChannelId>, ReqResult> channelMap;
UtcTime lastUsedTime;
};
using ClientMap = std::unordered_map<ClientId, std::shared_ptr<ClientStatus>>;
Shards<ClientMap, 1024> shards_;
};
} // namespace hf3fs::storage

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,161 @@
#pragma once
#include <folly/concurrency/ConcurrentHashMap.h>
#include <folly/executors/CPUThreadPoolExecutor.h>
#include "analytics/StructuredTraceLog.h"
#include "client/mgmtd/IMgmtdClientForServer.h"
#include "client/mgmtd/RoutingInfo.h"
#include "client/storage/StorageMessenger.h"
#include "common/net/Server.h"
#include "common/net/Transport.h"
#include "common/net/ib/IBSocket.h"
#include "common/net/ib/RDMABuf.h"
#include "common/utils/Address.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/Coroutine.h"
#include "common/utils/LockManager.h"
#include "common/utils/Semaphore.h"
#include "storage/aio/AioReadWorker.h"
#include "storage/service/BufferPool.h"
#include "storage/service/ReliableForwarding.h"
#include "storage/service/ReliableUpdate.h"
#include "storage/store/StorageTargets.h"
#include "storage/update/UpdateWorker.h"
namespace hf3fs::storage {
struct Components;
class StorageOperator {
public:
class Config : public ConfigBase<Config> {
CONFIG_OBJ(write_worker, UpdateWorker::Config);
CONFIG_OBJ(event_trace_log, analytics::StructuredTraceLog<StorageEventTrace>::Config);
CONFIG_HOT_UPDATED_ITEM(max_num_results_per_query, uint32_t{100});
CONFIG_HOT_UPDATED_ITEM(batch_read_job_split_size, uint32_t{1024});
CONFIG_HOT_UPDATED_ITEM(post_buffer_per_bytes, 64_KB);
CONFIG_HOT_UPDATED_ITEM(batch_read_ignore_chain_version, false);
CONFIG_HOT_UPDATED_ITEM(max_concurrent_rdma_writes, 256U);
CONFIG_HOT_UPDATED_ITEM(max_concurrent_rdma_reads, 256U);
CONFIG_HOT_UPDATED_ITEM(read_only, false);
CONFIG_HOT_UPDATED_ITEM(rdma_transmission_req_timeout, 0_ms);
CONFIG_HOT_UPDATED_ITEM(apply_transmission_before_getting_semaphore, true);
};
StorageOperator(const Config &config, Components &components)
: config_(config),
components_(components),
updateWorker_(config_.write_worker()),
storageEventTrace_(config.event_trace_log()) {
for (const auto &ibdev : net::IBDevice::all()) {
concurrentRdmaWriteSemaphore_.emplace(ibdev->id(), config.max_concurrent_rdma_writes());
concurrentRdmaReadSemaphore_.emplace(ibdev->id(), config.max_concurrent_rdma_reads());
}
onConfigUpdated_ = config_.addCallbackGuard([this]() {
for (auto &[_, semaphore] : concurrentRdmaWriteSemaphore_) {
semaphore.changeUsableTokens(config_.max_concurrent_rdma_writes());
}
for (auto &[_, semaphore] : concurrentRdmaReadSemaphore_) {
semaphore.changeUsableTokens(config_.max_concurrent_rdma_reads());
}
});
}
Result<Void> init(uint32_t numberOfDisks);
Result<Void> stopAndJoin();
CoTryTask<BatchReadRsp> batchRead(ServiceRequestContext &requestCtx,
const BatchReadReq &req,
serde::CallContext &ctx);
CoTryTask<WriteRsp> write(ServiceRequestContext &requestCtx, const WriteReq &req, net::IBSocket *ibSocket);
CoTryTask<UpdateRsp> update(ServiceRequestContext &requestCtx, const UpdateReq &req, net::IBSocket *ibSocket);
CoTryTask<QueryLastChunkRsp> queryLastChunk(ServiceRequestContext &requestCtx, const QueryLastChunkReq &req);
CoTryTask<TruncateChunksRsp> truncateChunks(ServiceRequestContext &requestCtx, const TruncateChunksReq &req);
CoTryTask<RemoveChunksRsp> removeChunks(ServiceRequestContext &requestCtx, const RemoveChunksReq &req);
CoTryTask<TargetSyncInfo> syncStart(const SyncStartReq &req);
CoTryTask<SyncDoneRsp> syncDone(const SyncDoneReq &req);
CoTryTask<SpaceInfoRsp> spaceInfo(const SpaceInfoReq &req);
CoTryTask<CreateTargetRsp> createTarget(const CreateTargetReq &req);
CoTryTask<OfflineTargetRsp> offlineTarget(const OfflineTargetReq &req);
CoTryTask<RemoveTargetRsp> removeTarget(const RemoveTargetReq &req);
CoTryTask<QueryChunkRsp> queryChunk(const QueryChunkReq &req);
CoTryTask<GetAllChunkMetadataRsp> getAllChunkMetadata(const GetAllChunkMetadataReq &req);
protected:
using ChunkMetadataProcessor = std::function<CoTryTask<void>(const ChunkId &, const ChunkMetadata &)>;
CoTask<IOResult> handleUpdate(ServiceRequestContext &requestCtx,
UpdateReq &req,
net::IBSocket *ibSocket,
TargetPtr &target);
CoTask<IOResult> doUpdate(ServiceRequestContext &requestCtx,
const UpdateIO &updateIO,
const UpdateOptions &updateOptions,
uint32_t featureFlags,
const std::shared_ptr<StorageTarget> &target,
net::IBSocket *ibSocket,
BufferPool::Buffer &buffer,
net::RDMARemoteBuf &remoteBuf,
ChunkEngineUpdateJob &chunkEngineJob,
bool allowToAllocate);
CoTask<IOResult> doCommit(ServiceRequestContext &requestCtx,
const CommitIO &commitIO,
const UpdateOptions &updateOptions,
ChunkEngineUpdateJob &chunkEngineJob,
uint32_t featureFlags,
const std::shared_ptr<StorageTarget> &target);
Result<std::vector<std::pair<ChunkId, ChunkMetadata>>> doQuery(ServiceRequestContext &requestCtx,
const VersionedChainId &vChainId,
const ChunkIdRange &chunkIdRange);
CoTryTask<uint32_t> processQueryResults(ServiceRequestContext &requestCtx,
const VersionedChainId &vChainId,
const ChunkIdRange &chunkIdRanges,
ChunkMetadataProcessor processor,
bool &moreChunksInRange);
CoTask<IOResult> doTruncate(ServiceRequestContext &requestCtx,
const TruncateChunkOp &op,
flat::UserInfo userInfo,
uint32_t featureFlags);
CoTask<IOResult> doRemove(ServiceRequestContext &requestCtx,
const RemoveChunksOp &op,
flat::UserInfo userInfo,
uint32_t featureFlags);
private:
friend class ReliableUpdate;
ConstructLog<"storage::StorageOperator"> constructLog_;
const Config &config_;
Components &components_;
UpdateWorker updateWorker_;
analytics::StructuredTraceLog<StorageEventTrace> storageEventTrace_;
std::unique_ptr<ConfigCallbackGuard> onConfigUpdated_;
std::map<uint8_t, hf3fs::Semaphore> concurrentRdmaWriteSemaphore_;
std::map<uint8_t, hf3fs::Semaphore> concurrentRdmaReadSemaphore_;
std::atomic<uint64_t> totalReadBytes_{};
std::atomic<uint64_t> totalReadIOs_{};
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,60 @@
#include "storage/service/StorageServer.h"
#include <folly/experimental/coro/BlockingWait.h>
#include <folly/experimental/coro/WithCancellation.h>
#include <folly/logging/xlog.h>
#include "common/kv/mem/MemKVEngine.h"
#include "common/utils/Result.h"
#include "core/service/CoreService.h"
#include "storage/service/ReliableForwarding.h"
#include "storage/service/StorageService.h"
#include "stubs/common/RealStubFactory.h"
#include "stubs/mgmtd/MgmtdServiceStub.h"
namespace hf3fs::storage {
StorageServer::StorageServer(const Components::Config &config)
: net::Server(config.base()),
components_(config) {}
StorageServer::~StorageServer() {
stopAndJoin();
XLOGF(INFO, "Destructor StorageServer");
}
Result<Void> StorageServer::beforeStart() {
RETURN_AND_LOG_ON_ERROR(addSerdeService(std::make_unique<StorageService>(components_.storageOperator), true));
RETURN_AND_LOG_ON_ERROR(addSerdeService(std::make_unique<core::CoreService>()));
groups().front()->setCoroutinesPoolGetter([this](const serde::MessagePacket<> &packet) -> DynamicCoroutinesPool & {
switch (packet.serviceId) {
case StorageSerde<>::kServiceID:
return components_.getCoroutinesPool(packet.methodId);
default:
return components_.defaultPool;
}
});
RETURN_AND_LOG_ON_ERROR(components_.start(appInfo(), tpg()));
return Void{};
}
Result<Void> StorageServer::beforeStop() {
components_.reliableUpdate.beforeStop();
components_.reliableForwarding.beforeStop();
return Void{};
}
Result<Void> StorageServer::afterStop() {
RETURN_AND_LOG_ON_ERROR(components_.stopAndJoin(tpg().procThreadPool()));
return Void{};
}
hf3fs::Result<Void> StorageServer::start(const flat::AppInfo &info,
std::unique_ptr<::hf3fs::net::Client> client,
std::shared_ptr<::hf3fs::client::MgmtdClient> mgmtdClient) {
components_.netClient = std::move(client);
components_.mgmtdClient = std::make_unique<hf3fs::client::MgmtdClientForServer>(std::move(mgmtdClient));
return net::Server::start(info);
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,59 @@
#pragma once
#include <folly/CancellationToken.h>
#include "client/mgmtd/MgmtdClientForServer.h"
#include "common/net/Server.h"
#include "core/app/ServerAppConfig.h"
#include "core/app/ServerLauncher.h"
#include "core/app/ServerLauncherConfig.h"
#include "core/app/ServerMgmtdClientFetcher.h"
#include "storage/service/Components.h"
#include "storage/service/ReliableForwarding.h"
#include "storage/service/ReliableUpdate.h"
#include "storage/service/StorageOperator.h"
namespace hf3fs::test {
struct StorageServerHelper;
}
namespace hf3fs::storage {
class StorageServer : public net::Server {
public:
static constexpr auto kName = "Storage";
static constexpr auto kNodeType = flat::NodeType::STORAGE;
using AppConfig = core::ServerAppConfig;
struct LauncherConfig : public core::ServerLauncherConfig {
LauncherConfig() { mgmtd_client() = hf3fs::client::MgmtdClientForServer::Config{}; }
};
using RemoteConfigFetcher = core::launcher::ServerMgmtdClientFetcher;
using Launcher = core::ServerLauncher<StorageServer>;
using CommonConfig = ApplicationBase::Config;
using Config = Components::Config;
StorageServer(const Components::Config &config);
~StorageServer() override;
// set up storage server.
Result<Void> beforeStart() final;
// before server stop.
Result<Void> beforeStop() final;
// tear down storage server.
Result<Void> afterStop() final;
using net::Server::start;
hf3fs::Result<Void> start(const flat::AppInfo &info,
std::unique_ptr<::hf3fs::net::Client> client,
std::shared_ptr<::hf3fs::client::MgmtdClient> mgmtdClient);
private:
friend struct test::StorageServerHelper;
ConstructLog<"storage::StorageServer"> constructLog_;
Components components_;
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,32 @@
#include "storage/service/StorageService.h"
#include "common/monitor/Recorder.h"
namespace hf3fs::storage {
namespace {
monitor::LatencyRecorder readQueueLatency{"storage.read.queue_latency"};
monitor::LatencyRecorder updateQueueLatency{"storage.update.queue_latency"};
monitor::LatencyRecorder defaultQueueLatency{"storage.default.queue_latency"};
} // namespace
void StorageService::reportReadQueueLatency(serde::CallContext &ctx) {
if (ctx.packet().timestamp) {
readQueueLatency.addSample(ctx.packet().timestamp->queueLatency());
}
}
void StorageService::reportUpdateQueueLatency(serde::CallContext &ctx) {
if (ctx.packet().timestamp) {
updateQueueLatency.addSample(ctx.packet().timestamp->queueLatency());
}
}
void StorageService::reportDefaultQueueLatency(serde::CallContext &ctx) {
if (ctx.packet().timestamp) {
defaultQueueLatency.addSample(ctx.packet().timestamp->queueLatency());
}
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,111 @@
#pragma once
#include "common/serde/CallContext.h"
#include "fbs/storage/Service.h"
#include "storage/service/StorageOperator.h"
namespace hf3fs::storage {
class StorageService : public serde::ServiceWrapper<StorageService, storage::StorageSerde> {
public:
StorageService(StorageOperator &storageOperator)
: storageOperator_(storageOperator) {}
CoTryTask<BatchReadRsp> batchRead(serde::CallContext &ctx, const BatchReadReq &req) {
reportReadQueueLatency(ctx);
if (UNLIKELY(req.payloads.empty())) co_return BatchReadRsp{.tag = req.tag};
ServiceRequestContext requestCtx{"batchRead", req.tag, req.retryCount, req.userInfo, req.debugFlags};
co_return co_await storageOperator_.batchRead(requestCtx, req, ctx);
}
CoTryTask<WriteRsp> write(serde::CallContext &ctx, const WriteReq &req) {
reportUpdateQueueLatency(ctx);
ServiceRequestContext requestCtx{"write", req.tag, req.retryCount, req.userInfo, req.debugFlags};
co_return co_await storageOperator_.write(requestCtx, req, ctx.transport()->ibSocket());
}
CoTryTask<UpdateRsp> update(serde::CallContext &ctx, const UpdateReq &req) {
reportUpdateQueueLatency(ctx);
ServiceRequestContext requestCtx{"update", req.tag, req.retryCount, req.userInfo, req.debugFlags};
co_return co_await storageOperator_.update(requestCtx, req, ctx.transport()->ibSocket());
}
CoTryTask<QueryLastChunkRsp> queryLastChunk(serde::CallContext &ctx, const QueryLastChunkReq &req) {
reportDefaultQueueLatency(ctx);
if (UNLIKELY(req.payloads.empty())) co_return QueryLastChunkRsp{};
ServiceRequestContext requestCtx{"queryLastChunk", req.tag, req.retryCount, req.userInfo, req.debugFlags};
co_return co_await storageOperator_.queryLastChunk(requestCtx, req);
}
CoTryTask<TruncateChunksRsp> truncateChunks(serde::CallContext &ctx, const TruncateChunksReq &req) {
reportDefaultQueueLatency(ctx);
if (UNLIKELY(req.payloads.empty())) co_return TruncateChunksRsp{};
ServiceRequestContext requestCtx{"truncateChunks",
req.payloads.front().tag,
req.payloads.front().retryCount,
req.userInfo,
req.debugFlags};
co_return co_await storageOperator_.truncateChunks(requestCtx, req);
}
CoTryTask<RemoveChunksRsp> removeChunks(serde::CallContext &ctx, const RemoveChunksReq &req) {
reportDefaultQueueLatency(ctx);
if (UNLIKELY(req.payloads.empty())) co_return RemoveChunksRsp{};
ServiceRequestContext requestCtx{"removeChunks",
req.payloads.front().tag,
req.payloads.front().retryCount,
req.userInfo,
req.debugFlags};
co_return co_await storageOperator_.removeChunks(requestCtx, req);
}
CoTryTask<TargetSyncInfo> syncStart(serde::CallContext &ctx, const SyncStartReq &req) {
reportDefaultQueueLatency(ctx);
return storageOperator_.syncStart(req);
}
CoTryTask<SyncDoneRsp> syncDone(serde::CallContext &ctx, const SyncDoneReq &req) {
reportDefaultQueueLatency(ctx);
return storageOperator_.syncDone(req);
}
CoTryTask<SpaceInfoRsp> spaceInfo(serde::CallContext &ctx, const SpaceInfoReq &req) {
reportDefaultQueueLatency(ctx);
return storageOperator_.spaceInfo(req);
}
CoTryTask<CreateTargetRsp> createTarget(serde::CallContext &ctx, const CreateTargetReq &req) {
reportDefaultQueueLatency(ctx);
return storageOperator_.createTarget(req);
}
CoTryTask<OfflineTargetRsp> offlineTarget(serde::CallContext &ctx, const OfflineTargetReq &req) {
reportDefaultQueueLatency(ctx);
return storageOperator_.offlineTarget(req);
}
CoTryTask<RemoveTargetRsp> removeTarget(serde::CallContext &ctx, const RemoveTargetReq &req) {
reportDefaultQueueLatency(ctx);
return storageOperator_.removeTarget(req);
}
CoTryTask<QueryChunkRsp> queryChunk(serde::CallContext &ctx, const QueryChunkReq &req) {
reportDefaultQueueLatency(ctx);
return storageOperator_.queryChunk(req);
}
CoTryTask<GetAllChunkMetadataRsp> getAllChunkMetadata(serde::CallContext &ctx, const GetAllChunkMetadataReq &req) {
reportDefaultQueueLatency(ctx);
return storageOperator_.getAllChunkMetadata(req);
}
private:
void reportReadQueueLatency(serde::CallContext &ctx);
void reportUpdateQueueLatency(serde::CallContext &ctx);
void reportDefaultQueueLatency(serde::CallContext &ctx);
private:
StorageOperator &storageOperator_;
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,418 @@
#include "storage/service/TargetMap.h"
#include <algorithm>
#include "common/monitor/Recorder.h"
#include "common/utils/RobinHood.h"
#include "fbs/mgmtd/MgmtdTypes.h"
namespace hf3fs::storage {
namespace {
monitor::OperationRecorder updateRoutingRecorder{"storage.update_routing"};
} // namespace
Result<net::Address> Target::getSuccessorAddr() const {
if (UNLIKELY(!successor.has_value())) {
return makeError(StorageCode::kNoSuccessorTarget);
}
auto &serviceGroups = successor->nodeInfo.app.serviceGroups;
if (UNLIKELY(serviceGroups.empty())) {
auto msg = fmt::format("target {} successor service groups is empty", *this);
XLOG(ERR, msg);
return makeError(StorageCode::kNoSuccessorAddr, std::move(msg));
}
auto &endpoints = serviceGroups.front().endpoints;
if (UNLIKELY(endpoints.empty())) {
auto msg = fmt::format("target {} successor service endpoints is empty", *this);
XLOG(ERR, msg);
return makeError(StorageCode::kNoSuccessorAddr, std::move(msg));
}
return endpoints.front();
}
Result<TargetId> TargetMap::getTargetId(ChainId chainId) const {
auto chainToTargetIt = chainToTarget_.find(chainId);
if (UNLIKELY(chainToTargetIt == chainToTarget_.end())) {
auto msg = fmt::format("chain {} not found", chainId);
XLOG(ERR, msg);
return makeError(StorageClientCode::kRoutingError, std::move(msg));
}
return chainToTargetIt->second;
}
Result<const Target *> TargetMap::getTarget(TargetId targetId) const {
auto targetsIt = targets_.find(targetId);
if (UNLIKELY(targetsIt == targets_.end())) {
auto msg = fmt::format("target {} not found", targetId);
XLOG(ERR, msg);
return makeError(StorageClientCode::kRoutingError, std::move(msg));
}
return &targetsIt->second;
}
Result<const Target *> TargetMap::getByChainId(VersionedChainId vChainId, bool allowOutdatedChainVer) const {
CHECK_RESULT(targetId, getTargetId(vChainId.chainId));
CHECK_RESULT(target, getTarget(targetId));
if (target->vChainId != vChainId && (!allowOutdatedChainVer || vChainId.chainVer > target->vChainId.chainVer)) {
auto msg = fmt::format("chain {} version mismatch request {} != local {}",
vChainId.chainId,
vChainId.chainVer,
target->vChainId.chainVer);
XLOG(ERR, msg);
return makeError(StorageClientCode::kRoutingVersionMismatch, std::move(msg));
}
if (target->localState == flat::LocalTargetState::OFFLINE) {
auto msg = fmt::format("chain {} target {} is offline", vChainId.chainId, target->targetId);
XLOG(ERR, msg);
return makeError(StorageCode::kTargetOffline, std::move(msg));
}
if (target->storageTarget == nullptr) {
auto msg = fmt::format("chain {} target {} is offline", vChainId.chainId, target->targetId);
XLOG(CRITICAL, msg);
return makeError(StorageCode::kTargetOffline, std::move(msg));
}
return target;
}
Result<Void> TargetMap::addStorageTarget(const std::shared_ptr<StorageTarget> &storageTarget) {
auto targetId = storageTarget->targetId();
Target target;
target.storageTarget = storageTarget;
target.targetId = targetId;
target.chainId = storageTarget->chainId();
target.path = storageTarget->path();
target.localState = flat::LocalTargetState::ONLINE;
target.diskIndex = storageTarget->diskIndex();
target.useChunkEngine = storageTarget->useChunkEngine();
auto [it, succ] = targets_.emplace(targetId, target);
if (UNLIKELY(!succ)) {
if (it->second.localState == flat::LocalTargetState::OFFLINE) {
it->second = std::move(target);
return Void{};
}
auto msg = fmt::format("target {} already exists", targetId);
XLOG(ERR, msg);
return makeError(StorageCode::kTargetStateInvalid, std::move(msg));
}
return Void{};
}
Result<Target *> TargetMap::getMutableTarget(TargetId targetId) {
auto targetsIt = targets_.find(targetId);
if (UNLIKELY(targetsIt == targets_.end())) {
auto msg = fmt::format("target {} not found", targetId);
return makeError(StorageClientCode::kRoutingError, std::move(msg));
}
return &targetsIt->second;
}
Result<Void> TargetMap::syncReceiveDone(VersionedChainId chainId) {
CHECK_RESULT(constTarget, getByChainId(chainId, false));
auto targetId = constTarget->targetId;
CHECK_RESULT(target, getMutableTarget(targetId));
XLOGF(WARNING,
"chain {} target {} sync receive done {} -> UPTODATE",
chainId,
targetId,
magic_enum::enum_name(target->localState));
target->localState = flat::LocalTargetState::UPTODATE;
return Void{};
}
Result<Void> TargetMap::updateRouting(std::shared_ptr<hf3fs::client::RoutingInfo> r, bool log /* = true */) {
auto recordGuard = updateRoutingRecorder.record();
if (UNLIKELY(r == nullptr)) {
XLOGF(ERR, "routing info is empty");
return makeError(StorageClientCode::kRoutingError, "routing info is empty");
}
auto &routingInfo = r->raw();
if (routingInfoVersion_ > routingInfo->routingInfoVersion) {
auto msg = fmt::format("routing info expired! {} > {}", routingInfoVersion_, routingInfo->routingInfoVersion);
XLOG(ERR, msg);
return makeError(StorageClientCode::kRoutingError, std::move(msg));
}
XLOGF(INFO, "routing info updated, {} -> {}", routingInfoVersion_, routingInfo->routingInfoVersion);
// 1. reset current state.
routingInfoVersion_ = routingInfo->routingInfoVersion;
chainToTarget_.clear();
syncingChains_.clear();
robin_hood::unordered_set<TargetId> headTargets;
robin_hood::unordered_set<TargetId> tailTargets;
robin_hood::unordered_set<TargetId> lastSrvTargets;
for (auto &[targetId, target] : targets_) {
if (target.isHead) {
headTargets.insert(target.targetId);
}
if (target.isTail) {
tailTargets.insert(target.targetId);
}
if (target.publicState == flat::PublicTargetState::LASTSRV) {
lastSrvTargets.insert(target.targetId);
}
target.isHead = false;
target.isTail = false;
target.vChainId = VersionedChainId{};
target.publicState = flat::PublicTargetState::INVALID;
target.successor = std::nullopt;
}
bool invalidRoutingInfo = false;
auto invalidRoutingInfoLogGuard = folly::makeGuard([&] {
if (invalidRoutingInfo) {
XLOGF(CRITICAL, "invalid routing info: {}", *routingInfo);
}
});
// 2. iterate routing info.
for (auto &[id, chain] : routingInfo->chains) {
// 3. find target in chain.
auto it = std::find_if(chain.targets.begin(), chain.targets.end(), [&](const flat::ChainTargetInfo &targetInfo) {
return bool(getMutableTarget(targetInfo.targetId));
});
if (it == chain.targets.end()) {
continue;
}
// 4. find target info.
auto targetId = it->targetId;
auto targetInfo = routingInfo->getTarget(targetId);
if (UNLIKELY(!targetInfo)) {
auto msg = fmt::format("targetInfo id {} not found", targetId);
XLOG(ERR, msg);
return makeError(StorageClientCode::kRoutingError, std::move(msg));
}
// 5. update local target.
CHECK_RESULT(target, getMutableTarget(targetId));
bool targetIsServing = targetInfo->publicState == flat::PublicTargetState::SERVING ||
targetInfo->publicState == flat::PublicTargetState::SYNCING;
auto previousLocalState = target->localState;
target->isHead = (targetIsServing && it == chain.targets.begin());
target->vChainId = VersionedChainId{chain.chainId, chain.chainVersion};
if (target->storageTarget != nullptr) {
if (target->storageTarget->chainId() == ChainId{}) {
RETURN_AND_LOG_ON_ERROR(target->storageTarget->setChainId(chain.chainId));
}
if (target->storageTarget->chainId() != chain.chainId) {
auto msg = fmt::format("target.chain != routing.chain, target {}, chain {}", *target, chain);
XLOG(ERR, msg);
return makeError(StorageClientCode::kRoutingError, std::move(msg));
}
}
target->localState = updateLocalState(targetId, previousLocalState, targetInfo->publicState);
target->publicState = targetInfo->publicState;
auto [chainToTargetIt, succ] = chainToTarget_.emplace(chain.chainId, targetId);
if (!succ) {
auto msg = fmt::format("chain {} map to 2 targets {}, {}", chain.chainId, chainToTargetIt->second, targetId);
XLOG(ERR, msg);
return makeError(StorageClientCode::kRoutingError, std::move(msg));
}
if (previousLocalState != flat::LocalTargetState::OFFLINE &&
target->localState == flat::LocalTargetState::OFFLINE) {
target->weakStorageTarget = target->storageTarget->aliveWeakPtr();
target->storageTarget = nullptr;
continue;
}
// 6. update successor.
while (targetIsServing && ++it != chain.targets.end()) {
auto targetInfo = routingInfo->getTarget(it->targetId);
if (UNLIKELY(!targetInfo)) {
auto msg = fmt::format("successor {} not found", it->targetId);
XLOG(ERR, msg);
return makeError(StorageClientCode::kRoutingError, std::move(msg));
}
if (targetInfo->publicState == flat::PublicTargetState::SERVING) {
target->successor = Successor{{}, *targetInfo};
} else if (targetInfo->publicState == flat::PublicTargetState::SYNCING) {
target->successor = Successor{{}, *targetInfo};
syncingChains_.push_back(VersionedChainId{chain.chainId, chain.chainVersion});
}
if (target->successor) {
if (!targetInfo->nodeId.has_value()) {
XLOGF(WARNING, "target {} node id is nullopt", it->targetId);
break;
}
auto node = routingInfo->getNode(*targetInfo->nodeId);
if (!node) {
XLOGF(WARNING, "node {} not found", targetInfo->nodeId);
break;
}
target->successor->nodeInfo = *node;
if (UNLIKELY(target->successor->nodeInfo.app.serviceGroups.empty())) {
XLOGF(CRITICAL, "successor invalid! chain {}, successor {}, node {}", chain.chainId, *targetInfo, *node);
invalidRoutingInfo = true;
}
}
break;
}
target->isTail = (targetIsServing && !target->successor.has_value());
if (headTargets.contains(targetId) ^ target->isHead) {
if (target->isHead) {
XLOGF_IF(WARNING, log, "target {} becomes head", targetId);
} else {
XLOGF_IF(WARNING, log, "target {} is no longer head", targetId);
}
}
if (tailTargets.contains(targetId) ^ target->isTail) {
if (target->isTail) {
XLOGF_IF(WARNING, log, "target {} becomes tail", targetId);
} else {
XLOGF_IF(WARNING, log, "target {} is no longer tail", targetId);
}
}
}
for (auto &[targetId, target] : targets_) {
if (lastSrvTargets.contains(targetId) && target.storageTarget &&
(target.publicState == flat::PublicTargetState::SERVING ||
target.publicState == flat::PublicTargetState::SYNCING ||
target.publicState == flat::PublicTargetState::WAITING)) {
target.storageTarget->resetUncommitted(target.vChainId.chainVer);
}
}
recordGuard.succ();
return Void{};
}
Result<Void> TargetMap::removeTarget(TargetId targetId) {
auto succ = targets_.erase(targetId);
if (succ != 1) {
auto msg = fmt::format("target {} not found", targetId);
return makeError(StorageClientCode::kRoutingError, std::move(msg));
}
return Void{};
}
Result<Void> TargetMap::offlineTarget(TargetId targetId) {
CHECK_RESULT(target, getMutableTarget(targetId));
if (target->unrecoverableOffline()) {
return makeError(StorageCode::kTargetOffline, fmt::format("target is already offline, {}.", *target));
}
target->offlineUponUserRequest = true;
target->localState = flat::LocalTargetState::OFFLINE;
return Void{};
}
Result<Void> TargetMap::offlineTargets(const Path &path) {
for (auto &[targetId, target] : targets_) {
if (path == target.path.parent_path() && !target.unrecoverableOffline()) {
target.diskError = true;
target.localState = flat::LocalTargetState::OFFLINE;
XLOGF(WARNING, "offline target {} because of disk error", target.path);
}
}
return Void{};
}
Result<Void> TargetMap::updateDiskState(const Path &path, bool lowSpace, bool rejectCreateChunk) {
for (auto &[targetId, target] : targets_) {
if (path == target.path.parent_path() && !target.unrecoverableOffline()) {
target.lowSpace = lowSpace;
auto old = std::exchange(target.rejectCreateChunk, rejectCreateChunk);
if (old != rejectCreateChunk) {
XLOGF(WARNING, "target {} reject create chunk {} -> {}", target.path, old, rejectCreateChunk);
}
}
}
return Void{};
}
hf3fs::flat::LocalTargetState TargetMap::updateLocalState(TargetId targetId,
hf3fs::flat::LocalTargetState localState,
hf3fs::flat::PublicTargetState publicState) {
if (localState == hf3fs::flat::LocalTargetState::UPTODATE &&
(publicState == hf3fs::flat::PublicTargetState::OFFLINE ||
publicState == hf3fs::flat::PublicTargetState::LASTSRV ||
publicState == hf3fs::flat::PublicTargetState::WAITING)) {
XLOGF(CRITICAL,
"move to offline state (shutdown), local target: {}, local state: {} -> OFFLINE, public state: {}",
targetId,
magic_enum::enum_name(localState),
magic_enum::enum_name(publicState));
return hf3fs::flat::LocalTargetState::OFFLINE;
} else if (localState == hf3fs::flat::LocalTargetState::ONLINE &&
publicState == hf3fs::flat::PublicTargetState::SERVING) {
XLOGF(INFO,
"move to up-to-date state, local target: {}, local state: {} -> UPTODATE, public state: {}",
targetId,
magic_enum::enum_name(localState),
magic_enum::enum_name(publicState));
return hf3fs::flat::LocalTargetState::UPTODATE;
}
return localState;
}
Result<std::shared_ptr<const Target>> AtomicallyTargetMap::getByChainId(
VersionedChainId vChainId,
bool allowOutdatedChainVer /* = false */) const {
auto map = snapshot();
auto result = map->getByChainId(vChainId, allowOutdatedChainVer);
RETURN_ON_ERROR(result);
return std::shared_ptr<const Target>(std::move(map), *result);
}
Result<std::shared_ptr<const Target>> AtomicallyTargetMap::getByTargetId(TargetId targetId) const {
auto map = snapshot();
auto result = map->getTarget(targetId);
RETURN_ON_ERROR(result);
return std::shared_ptr<const Target>(std::move(map), *result);
}
Result<Void> AtomicallyTargetMap::updateTargetMap(auto &&updateFunc) {
auto lock = std::unique_lock(mutex_);
auto map = snapshot();
while (true) {
auto newMap = map->clone();
RETURN_AND_LOG_ON_ERROR(updateFunc(newMap));
if (targetMap_.compare_exchange_strong(map, std::move(newMap))) {
break;
}
}
updateCallback_(*snapshot());
return Void{};
};
Result<Void> AtomicallyTargetMap::addStorageTarget(std::shared_ptr<StorageTarget> storageTarget) {
return updateTargetMap([&](std::shared_ptr<TargetMap> &newMap) { return newMap->addStorageTarget(storageTarget); });
}
Result<Void> AtomicallyTargetMap::syncReceiveDone(VersionedChainId vChainId) {
return updateTargetMap([&](std::shared_ptr<TargetMap> &newMap) { return newMap->syncReceiveDone(vChainId); });
}
Result<Void> AtomicallyTargetMap::updateRouting(std::shared_ptr<hf3fs::client::RoutingInfo> r) {
return updateTargetMap([&](std::shared_ptr<TargetMap> &newMap) { return newMap->updateRouting(r); });
}
Result<Void> AtomicallyTargetMap::removeTarget(TargetId targetId) {
return updateTargetMap([&](std::shared_ptr<TargetMap> &newMap) { return newMap->removeTarget(targetId); });
}
Result<Void> AtomicallyTargetMap::offlineTarget(TargetId targetId) {
return updateTargetMap([&](std::shared_ptr<TargetMap> &newMap) { return newMap->offlineTarget(targetId); });
}
Result<Void> AtomicallyTargetMap::offlineTargets(const Path &path) {
return updateTargetMap([&](std::shared_ptr<TargetMap> &newMap) { return newMap->offlineTargets(path); });
}
Result<Void> AtomicallyTargetMap::updateDiskState(const Path &path, bool lowSpace, bool rejectCreateChunk) {
return updateTargetMap(
[&](std::shared_ptr<TargetMap> &newMap) { return newMap->updateDiskState(path, lowSpace, rejectCreateChunk); });
}
void AtomicallyTargetMap::updateTargetUsedSize() {
auto lock = std::unique_lock(mutex_);
updateCallback_(*snapshot());
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,137 @@
#pragma once
#include <atomic>
#include <common/utils/RobinHood.h>
#include <folly/concurrency/AtomicSharedPtr.h>
#include <memory>
#include "client/mgmtd/RoutingInfo.h"
#include "common/serde/Serde.h"
#include "common/utils/ConstructLog.h"
#include "fbs/mgmtd/MgmtdTypes.h"
#include "fbs/mgmtd/NodeInfo.h"
#include "fbs/mgmtd/TargetInfo.h"
#include "fbs/storage/Common.h"
#include "storage/store/StorageTarget.h"
namespace hf3fs::test {
struct TargetMapHelper;
}
namespace hf3fs::storage {
class TargetMap {
public:
// [observers] clone current map.
std::shared_ptr<TargetMap> clone() const { return std::make_shared<TargetMap>(*this); }
// [observers] get target id by chain id.
Result<TargetId> getTargetId(ChainId chainId) const;
// [observers] get target by target id.
Result<const Target *> getTarget(TargetId targetId) const;
// [observers] get target by versioned chain id.
Result<const Target *> getByChainId(VersionedChainId vChainId, bool allowOutdatedChainVer) const;
// [observers]
auto &getTargets() const { return targets_; }
// [observers]
auto &syncingChains() const { return syncingChains_; }
// [modifiers] add a new target.
Result<Void> addStorageTarget(const std::shared_ptr<StorageTarget> &storageTarget);
// [modifiers] get target by target id.
Result<Target *> getMutableTarget(TargetId targetId);
// [modifiers] sync receive is started.
Result<Void> syncReceiveDone(VersionedChainId vChainId);
// [modifiers] update by routing info.
Result<Void> updateRouting(std::shared_ptr<hf3fs::client::RoutingInfo> r, bool log = true);
// [modifiers] set target as offline.
Result<Void> removeTarget(TargetId targetId);
// [modifiers] set target as offline.
Result<Void> offlineTarget(TargetId targetId);
// [modifiers] set targets in path as offline.
Result<Void> offlineTargets(const Path &path);
// [modifiers] reject create chunk for targets in path.
Result<Void> updateDiskState(const Path &path, bool lowSpace, bool rejectCreateChunk);
// update local state.
static hf3fs::flat::LocalTargetState updateLocalState(TargetId targetId,
hf3fs::flat::LocalTargetState localState,
hf3fs::flat::PublicTargetState publicState);
private:
friend struct test::TargetMapHelper;
robin_hood::unordered_map<TargetId, Target> targets_;
flat::RoutingInfoVersion routingInfoVersion_;
robin_hood::unordered_map<ChainId, TargetId> chainToTarget_;
std::vector<VersionedChainId> syncingChains_;
};
class AtomicallyTargetMap {
public:
// [observers] get a snapshot of target map.
auto snapshot() const { return targetMap_.load(); }
// [observers] get target by chain id.
Result<std::shared_ptr<const Target>> getByChainId(VersionedChainId vChainId,
bool allowOutdatedChainVer = false) const;
// [observers] get target by its id.
Result<std::shared_ptr<const Target>> getByTargetId(TargetId targetId) const;
// [modifiers] set update callback.
void setUpdateCallback(auto &&func) {
auto lock = std::unique_lock(mutex_);
updateCallback_ = std::forward<decltype(func)>(func);
}
// [modifiers] add a target.
Result<Void> addStorageTarget(std::shared_ptr<StorageTarget> storageTarget);
// [modifiers] sync receive is done.
Result<Void> syncReceiveDone(VersionedChainId vChainId);
// [modifiers] update by routing info.
Result<Void> updateRouting(std::shared_ptr<hf3fs::client::RoutingInfo> r);
// [modifiers] set target as offline.
Result<Void> removeTarget(TargetId targetId);
// [modifiers] set target as offline.
Result<Void> offlineTarget(TargetId targetId);
// [modifiers] set targets in path as offline.
Result<Void> offlineTargets(const Path &path);
// [modifiers] reject create chunk for targets in path.
Result<Void> updateDiskState(const Path &path, bool lowSpace, bool rejectCreateChunk);
// [modifiers] update target used size.
void updateTargetUsedSize();
// [modifiers] release target map.
auto release() { return targetMap_.exchange(nullptr); }
protected:
// [modifiers] update target map atomically.
Result<Void> updateTargetMap(auto &&updateFunc);
private:
friend struct test::TargetMapHelper;
ConstructLog<"storage::AtomicallyTargetMap"> constructLog_;
std::mutex mutex_; // for update operation.
std::function<void(const TargetMap &)> updateCallback_ = [](auto) {};
folly::atomic_shared_ptr<const TargetMap> targetMap_{std::make_shared<const TargetMap>()};
};
} // namespace hf3fs::storage