mirror of
https://github.com/deepseek-ai/3FS
synced 2025-06-26 18:16:45 +00:00
Initial commit
This commit is contained in:
183
src/storage/service/BufferPool.cc
Normal file
183
src/storage/service/BufferPool.cc
Normal file
@@ -0,0 +1,183 @@
|
||||
#include "storage/service/BufferPool.h"
|
||||
|
||||
#include <folly/experimental/coro/BlockingWait.h>
|
||||
#include <folly/experimental/coro/Collect.h>
|
||||
#include <folly/experimental/coro/Invoke.h>
|
||||
#include <folly/experimental/coro/Task.h>
|
||||
#include <sys/uio.h>
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/net/ib/RDMABuf.h"
|
||||
#include "common/utils/MagicEnum.hpp"
|
||||
#include "fbs/storage/Common.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
namespace {
|
||||
|
||||
void alignBuffer(net::RDMABuf &rdmabuf) {
|
||||
auto address = reinterpret_cast<uint64_t>(rdmabuf.ptr());
|
||||
auto remain = address % kAIOAlignSize;
|
||||
if (remain == 0) {
|
||||
return;
|
||||
}
|
||||
auto crop = kAIOAlignSize - remain;
|
||||
rdmabuf.advance(std::min(crop, rdmabuf.size()));
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
Result<Void> BufferPool::init(CPUExecutorGroup &executor) {
|
||||
buffers_.clear();
|
||||
buffers_.reserve(UIO_MAXIOV);
|
||||
|
||||
auto smallBufferResult =
|
||||
initBuffers(executor, config_.rdmabuf_size(), config_.rdmabuf_count(), UIO_MAXIOV / 2, buffers_);
|
||||
RETURN_AND_LOG_ON_ERROR(smallBufferResult);
|
||||
*freeIndex_.lock() = std::move(*smallBufferResult);
|
||||
|
||||
bigBufferRegisterIndexStart_ = buffers_.size();
|
||||
|
||||
auto bigBufferResult =
|
||||
initBuffers(executor, config_.big_rdmabuf_size(), config_.big_rdmabuf_count(), UIO_MAXIOV / 2, buffers_);
|
||||
RETURN_AND_LOG_ON_ERROR(bigBufferResult);
|
||||
*bigFreeIndex_.lock() = std::move(*bigBufferResult);
|
||||
|
||||
iovecs_.clear();
|
||||
iovecs_.reserve(buffers_.size());
|
||||
for (auto &buf : buffers_) {
|
||||
iovecs_.push_back({(void *)buf.ptr(), buf.size()});
|
||||
}
|
||||
return Void{};
|
||||
}
|
||||
|
||||
Result<std::vector<BufferIndex>> BufferPool::initBuffers(CPUExecutorGroup &executor,
|
||||
Size rdmabufSize,
|
||||
uint32_t rdmabufCount,
|
||||
uint32_t limit,
|
||||
std::vector<net::RDMABuf> &outBuffers) {
|
||||
size_t totalSize = rdmabufSize * rdmabufCount;
|
||||
size_t bufferCount = std::min(limit, rdmabufCount);
|
||||
size_t smallBufferCount = (totalSize / bufferCount + rdmabufSize - 1) / rdmabufSize;
|
||||
size_t bufferSize = smallBufferCount * rdmabufSize;
|
||||
auto pool = net::RDMABufPool::create(bufferSize, bufferCount);
|
||||
|
||||
std::vector<folly::coro::TaskWithExecutor<net::RDMABuf>> tasks;
|
||||
tasks.reserve(bufferCount);
|
||||
for (auto i = 0u; i < bufferCount; ++i) {
|
||||
tasks.push_back(pool->allocate().scheduleOn(&executor.pickNext()));
|
||||
}
|
||||
XLOGF(INFO, "allocate {} * {} RDMA buffers started", bufferCount, Size{bufferSize});
|
||||
auto buffers = folly::coro::blockingWait(folly::coro::collectAllRange(std::move(tasks)));
|
||||
XLOGF(INFO, "allocate {} * {} RDMA buffers finished", bufferCount, Size{bufferSize});
|
||||
|
||||
std::vector<BufferIndex> freeIndex;
|
||||
freeIndex.reserve(rdmabufCount);
|
||||
for (auto &buf : buffers) {
|
||||
if (UNLIKELY(!buf)) {
|
||||
auto msg = fmt::format("storage init buffer pool failed");
|
||||
XLOG(ERR, msg);
|
||||
return makeError(StorageCode::kStorageInitFailed, std::move(msg));
|
||||
}
|
||||
alignBuffer(buf);
|
||||
|
||||
BufferIndex bufferIndex;
|
||||
bufferIndex.registerIndex = outBuffers.size();
|
||||
outBuffers.push_back(buf);
|
||||
|
||||
auto split = buf;
|
||||
for (; split.size() >= rdmabufSize; split.advance(rdmabufSize)) {
|
||||
bufferIndex.buffer = split.first(rdmabufSize);
|
||||
freeIndex.push_back(bufferIndex);
|
||||
}
|
||||
}
|
||||
return Result<std::vector<BufferIndex>>(std::move(freeIndex));
|
||||
}
|
||||
|
||||
BufferPool::Buffer::~Buffer() {
|
||||
for (auto &index : indices_) {
|
||||
pool_->deallocate(index);
|
||||
}
|
||||
}
|
||||
|
||||
Result<net::RDMABuf> BufferPool::Buffer::tryAllocate(uint32_t size) {
|
||||
if (indices_.empty() || current_.size() < size) {
|
||||
if (UNLIKELY(size > pool_->rdmabufSize_)) {
|
||||
return makeError(StorageCode::kBufferSizeExceeded);
|
||||
}
|
||||
if (LIKELY(pool_->semaphore_.try_wait())) {
|
||||
auto index = pool_->allocate();
|
||||
indices_.push_back(index);
|
||||
current_ = index.buffer;
|
||||
} else {
|
||||
return makeError(RPCCode::kRDMANoBuf);
|
||||
}
|
||||
}
|
||||
auto ret = current_.takeFirst(size);
|
||||
assert(ret);
|
||||
alignBuffer(current_);
|
||||
return ret;
|
||||
}
|
||||
|
||||
CoTryTask<net::RDMABuf> BufferPool::Buffer::allocate(uint32_t size) {
|
||||
if (indices_.empty() || current_.size() < size) {
|
||||
if (UNLIKELY(size > pool_->bigRdmabufSize_)) {
|
||||
co_return makeError(StorageCode::kBufferSizeExceeded);
|
||||
} else if (UNLIKELY(size > pool_->rdmabufSize_)) {
|
||||
co_await pool_->bigSemaphore_.co_wait();
|
||||
auto index = pool_->allocateBig();
|
||||
indices_.push_back(index);
|
||||
current_ = index.buffer;
|
||||
} else {
|
||||
co_await pool_->semaphore_.co_wait();
|
||||
auto index = pool_->allocate();
|
||||
indices_.push_back(index);
|
||||
current_ = index.buffer;
|
||||
}
|
||||
}
|
||||
auto ret = current_.takeFirst(size);
|
||||
assert(ret);
|
||||
alignBuffer(current_);
|
||||
co_return ret;
|
||||
}
|
||||
|
||||
void BufferPool::clear(CPUExecutorGroup &executor) {
|
||||
std::vector<folly::coro::TaskWithExecutor<void>> tasks;
|
||||
tasks.reserve(buffers_.size());
|
||||
for (auto &buffer : buffers_) {
|
||||
tasks.push_back(folly::coro::co_invoke([&, buf = std::move(buffer)]() mutable -> CoTask<void> {
|
||||
buf = {};
|
||||
co_return;
|
||||
}).scheduleOn(&executor.pickNext()));
|
||||
}
|
||||
XLOGF(INFO, "deallocate {} RDMA buffers started", buffers_.size());
|
||||
folly::coro::blockingWait(folly::coro::collectAllRange(std::move(tasks)));
|
||||
XLOGF(INFO, "deallocate {} RDMA buffers finished", buffers_.size());
|
||||
}
|
||||
|
||||
BufferIndex BufferPool::allocate() {
|
||||
auto guard = freeIndex_.lock();
|
||||
assert(!guard->empty());
|
||||
auto ret = guard->back();
|
||||
guard->pop_back();
|
||||
return ret;
|
||||
}
|
||||
|
||||
BufferIndex BufferPool::allocateBig() {
|
||||
auto guard = bigFreeIndex_.lock();
|
||||
assert(!guard->empty());
|
||||
auto ret = guard->back();
|
||||
guard->pop_back();
|
||||
return ret;
|
||||
}
|
||||
|
||||
void BufferPool::deallocate(const BufferIndex &index) {
|
||||
if (UNLIKELY(index.registerIndex >= bigBufferRegisterIndexStart_)) {
|
||||
bigFreeIndex_.lock()->push_back(index);
|
||||
bigSemaphore_.signal();
|
||||
} else {
|
||||
freeIndex_.lock()->push_back(index);
|
||||
semaphore_.signal();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
92
src/storage/service/BufferPool.h
Normal file
92
src/storage/service/BufferPool.h
Normal file
@@ -0,0 +1,92 @@
|
||||
#pragma once
|
||||
|
||||
#include <folly/Synchronized.h>
|
||||
#include <folly/executors/CPUThreadPoolExecutor.h>
|
||||
#include <folly/fibers/Semaphore.h>
|
||||
#include <limits>
|
||||
|
||||
#include "common/net/ib/RDMABuf.h"
|
||||
#include "common/utils/CPUExecutorGroup.h"
|
||||
#include "common/utils/ConfigBase.h"
|
||||
#include "common/utils/ConstructLog.h"
|
||||
#include "common/utils/Size.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
struct BufferIndex {
|
||||
uint32_t registerIndex;
|
||||
net::RDMABuf buffer;
|
||||
};
|
||||
|
||||
class BufferPool {
|
||||
public:
|
||||
class Config : public ConfigBase<Config> {
|
||||
CONFIG_ITEM(rdmabuf_size, 4_MB);
|
||||
CONFIG_ITEM(rdmabuf_count, 1024u);
|
||||
CONFIG_ITEM(big_rdmabuf_size, 64_MB);
|
||||
CONFIG_ITEM(big_rdmabuf_count, 64u);
|
||||
};
|
||||
BufferPool(const Config &config)
|
||||
: config_(config),
|
||||
rdmabufSize_(config_.rdmabuf_size()),
|
||||
semaphore_(config_.rdmabuf_count()),
|
||||
bigRdmabufSize_(config_.big_rdmabuf_size()),
|
||||
bigSemaphore_(config_.big_rdmabuf_count()) {}
|
||||
|
||||
Result<Void> init(CPUExecutorGroup &executor);
|
||||
|
||||
auto &iovecs() const { return iovecs_; }
|
||||
|
||||
class Buffer {
|
||||
public:
|
||||
explicit Buffer(BufferPool &pool)
|
||||
: pool_(&pool) {}
|
||||
Buffer(const Buffer &) = delete;
|
||||
Buffer(Buffer &&other) = default;
|
||||
Buffer &operator=(Buffer &&other) = default;
|
||||
~Buffer();
|
||||
|
||||
Result<net::RDMABuf> tryAllocate(uint32_t size);
|
||||
|
||||
CoTryTask<net::RDMABuf> allocate(uint32_t size);
|
||||
|
||||
auto index() const { return indices_.back().registerIndex; }
|
||||
|
||||
private:
|
||||
BufferPool *pool_{};
|
||||
std::vector<BufferIndex> indices_;
|
||||
net::RDMABuf current_;
|
||||
};
|
||||
auto get() { return Buffer{*this}; }
|
||||
|
||||
void clear(CPUExecutorGroup &executor);
|
||||
|
||||
protected:
|
||||
static Result<std::vector<BufferIndex>> initBuffers(CPUExecutorGroup &executor,
|
||||
Size rdmabufSize,
|
||||
uint32_t rdmabufCount,
|
||||
uint32_t limit,
|
||||
std::vector<net::RDMABuf> &outBuffers);
|
||||
|
||||
BufferIndex allocate();
|
||||
|
||||
BufferIndex allocateBig();
|
||||
|
||||
void deallocate(const BufferIndex &index);
|
||||
|
||||
private:
|
||||
ConstructLog<"storage::BufferPool"> constructLog_;
|
||||
const Config &config_;
|
||||
Size rdmabufSize_;
|
||||
std::vector<net::RDMABuf> buffers_;
|
||||
std::vector<struct iovec> iovecs_;
|
||||
folly::fibers::Semaphore semaphore_;
|
||||
folly::Synchronized<std::vector<BufferIndex>, std::mutex> freeIndex_;
|
||||
|
||||
Size bigRdmabufSize_;
|
||||
uint32_t bigBufferRegisterIndexStart_ = 0;
|
||||
folly::fibers::Semaphore bigSemaphore_;
|
||||
folly::Synchronized<std::vector<BufferIndex>, std::mutex> bigFreeIndex_;
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
263
src/storage/service/Components.cc
Normal file
263
src/storage/service/Components.cc
Normal file
@@ -0,0 +1,263 @@
|
||||
#include "storage/service/Components.h"
|
||||
|
||||
#include <folly/experimental/coro/BlockingWait.h>
|
||||
|
||||
#include "common/app/ApplicationBase.h"
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/utils/LogCommands.h"
|
||||
#include "stubs/common/RealStubFactory.h"
|
||||
#include "stubs/mgmtd/MgmtdServiceStub.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
namespace {
|
||||
|
||||
constexpr std::string_view kRoutingInfoListenerName = "Components";
|
||||
monitor::ValueRecorder targetStateRecorder{"storage.target_state", std::nullopt, false};
|
||||
|
||||
} // namespace
|
||||
|
||||
Components::Components(const Config &config)
|
||||
: config(config),
|
||||
rdmabufPool(config.buffer_pool()),
|
||||
storageTargets(config.targets(), targetMap),
|
||||
aioReadWorker(config.aio_read_worker()),
|
||||
messenger(config.forward_client()),
|
||||
resyncWorker(config.sync_worker(), *this),
|
||||
checkWorker(config.check_worker(), *this),
|
||||
dumpWorker(config.dump_worker(), *this),
|
||||
allocateWorker(config.allocate_worker(), *this),
|
||||
punchHoleWorker(*this),
|
||||
syncMetaKvWorker(config.sync_meta_kv_worker(), *this),
|
||||
reliableForwarding(config.reliable_forwarding(), *this),
|
||||
readPool(config.coroutines_pool_read(), "ReadPool"),
|
||||
updatePool(config.coroutines_pool_update(), "UpdatePool"),
|
||||
syncPool(config.coroutines_pool_default(), "SyncPool"),
|
||||
defaultPool(config.coroutines_pool_default(), "DefaultPool"),
|
||||
storageOperator(config.storage(), *this),
|
||||
reliableUpdate(config.reliable_update(), *this) {}
|
||||
|
||||
Result<Void> Components::start(const flat::AppInfo &appInfo, net::ThreadPoolGroup &tpg) {
|
||||
this->appInfo = appInfo;
|
||||
|
||||
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start rdmabufPool", rdmabufPool.init(tpg.procThreadPool()));
|
||||
|
||||
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start readPool", readPool.start());
|
||||
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start updatePool", updatePool.start());
|
||||
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start syncPool", syncPool.start());
|
||||
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start defaultPool", defaultPool.start());
|
||||
|
||||
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start messenger", messenger.start());
|
||||
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start reliableForwarding", reliableForwarding.init());
|
||||
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start storageTargets", storageTargets.load(tpg.procThreadPool()));
|
||||
RETURN_ON_ERROR_LOG_WRAPPED(INFO,
|
||||
"Start aioReadWorker",
|
||||
aioReadWorker.start(storageTargets.fds(), rdmabufPool.iovecs()));
|
||||
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start dumpWorker", dumpWorker.start(appInfo.nodeId));
|
||||
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start allocateWorker", allocateWorker.start());
|
||||
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start punchHoleWorker", punchHoleWorker.start());
|
||||
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start syncMetaKvWorker", syncMetaKvWorker.start());
|
||||
|
||||
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start waitRoutingInfo", waitRoutingInfo(appInfo, tpg.bgThreadPool().randomPick()));
|
||||
|
||||
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start resyncWorker", resyncWorker.start());
|
||||
RETURN_ON_ERROR_LOG_WRAPPED(INFO,
|
||||
"Start checkWorker",
|
||||
checkWorker.start(storageTargets.targetPaths(), storageTargets.manufacturers()));
|
||||
|
||||
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start storageOperator", storageOperator.init(storageTargets.targetPaths().size()));
|
||||
return Void{};
|
||||
}
|
||||
|
||||
Result<Void> Components::waitRoutingInfo(const flat::AppInfo &appInfo, folly::CPUThreadPoolExecutor &executor) {
|
||||
// 1. init mgdtd client.
|
||||
if (!netClient) {
|
||||
netClient = std::make_unique<net::Client>(config.client());
|
||||
RETURN_AND_LOG_ON_ERROR(netClient->start());
|
||||
}
|
||||
if (mgmtdClient.load() == nullptr) {
|
||||
auto stubFactory = std::make_unique<hf3fs::stubs::RealStubFactory<mgmtd::MgmtdServiceStub>>(
|
||||
hf3fs::stubs::ClientContextCreator{[&](net::Address addr) { return netClient->serdeCtx(addr); }});
|
||||
mgmtdClient = std::make_shared<hf3fs::client::MgmtdClientForServer>(appInfo.clusterId,
|
||||
std::move(stubFactory),
|
||||
config.mgmtd());
|
||||
}
|
||||
mgmtdClient.load()->setAppInfoForHeartbeat(appInfo);
|
||||
mgmtdClient.load()->setConfigListener(ApplicationBase::updateConfig);
|
||||
|
||||
// 2. wait target offline.
|
||||
auto currentMap = targetMap.snapshot();
|
||||
updateHeartbeatPayload(*currentMap, true);
|
||||
folly::coro::blockingWait(mgmtdClient.load()->start(&executor));
|
||||
for (auto sleep = 0;; ++sleep) {
|
||||
if (sleep) {
|
||||
XLOGF(WARNING, "Waiting for target offline in routing info...");
|
||||
std::this_thread::sleep_for(1000_ms);
|
||||
}
|
||||
|
||||
folly::coro::blockingWait(mgmtdClient.load()->heartbeat());
|
||||
auto copy = currentMap->clone();
|
||||
auto refreshResult = folly::coro::blockingWait(mgmtdClient.load()->refreshRoutingInfo(false));
|
||||
if (UNLIKELY(!refreshResult)) {
|
||||
XLOGF(ERR, "refresh routing info error {}", refreshResult.error());
|
||||
continue;
|
||||
}
|
||||
auto result = copy->updateRouting(mgmtdClient.load()->getRoutingInfo(), false);
|
||||
if (UNLIKELY(!result)) {
|
||||
XLOGF(ERR, "get and parse routing info error {}", result.error());
|
||||
continue;
|
||||
}
|
||||
bool needWaiting = false;
|
||||
for (auto &[targetId, target] : copy->getTargets()) {
|
||||
if (target.publicState == flat::PublicTargetState::SERVING ||
|
||||
target.publicState == flat::PublicTargetState::SYNCING ||
|
||||
target.publicState == flat::PublicTargetState::WAITING) {
|
||||
XLOGF(WARNING, "waiting for chain {} target {}", targetId, serde::toJsonString(target));
|
||||
needWaiting = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!needWaiting) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// 3. set listener.
|
||||
targetMap.setUpdateCallback([this](const TargetMap &map) { updateHeartbeatPayload(map); });
|
||||
RETURN_AND_LOG_ON_ERROR(refreshRoutingInfo());
|
||||
folly::coro::blockingWait(mgmtdClient.load()->heartbeat());
|
||||
XLOGF(INFO, "Initial target map: {}", serde::toJsonString(targetMap.snapshot()->getTargets()));
|
||||
bool succ = mgmtdClient.load()->addRoutingInfoListener(std::string{kRoutingInfoListenerName},
|
||||
[this](auto) { refreshRoutingInfo(); });
|
||||
if (UNLIKELY(!succ)) {
|
||||
auto msg = fmt::format("node {} addRoutingInfoListener failed!", appInfo.nodeId);
|
||||
XLOG(ERR, msg);
|
||||
return makeError(StorageCode::kStorageInitFailed, std::move(msg));
|
||||
}
|
||||
return Void{};
|
||||
}
|
||||
|
||||
Result<Void> Components::refreshRoutingInfo() { return targetMap.updateRouting(mgmtdClient.load()->getRoutingInfo()); }
|
||||
|
||||
Result<Void> Components::stopAndJoin(CPUExecutorGroup &executor) {
|
||||
LOG_COMMAND(INFO, "Stop aioReadWorker", aioReadWorker.stopAndJoin());
|
||||
LOG_COMMAND(INFO, "Stop syncMetaKvWorker", syncMetaKvWorker.stopAndJoin());
|
||||
LOG_COMMAND(INFO, "Stop punchHoleWorker", punchHoleWorker.stopAndJoin());
|
||||
LOG_COMMAND(INFO, "Stop allocateWorker", allocateWorker.stopAndJoin());
|
||||
LOG_COMMAND(INFO, "Stop dumpWorker", dumpWorker.stopAndJoin());
|
||||
LOG_COMMAND(INFO, "Stop checkWorker", checkWorker.stopAndJoin());
|
||||
LOG_COMMAND(INFO, "Stop resyncWorker", resyncWorker.stopAndJoin());
|
||||
LOG_COMMAND(INFO, "Stop storageOperator", storageOperator.stopAndJoin());
|
||||
LOG_COMMAND(INFO, "Stop reliableForwarding", reliableForwarding.stopAndJoin());
|
||||
LOG_COMMAND(INFO, "Stop messenger", messenger.stopAndJoin());
|
||||
targetMap.setUpdateCallback([](auto) {});
|
||||
XLOGF(INFO, "Send offline state");
|
||||
if (auto mgmtd = mgmtdClient.load()) {
|
||||
mgmtd->removeRoutingInfoListener(kRoutingInfoListenerName);
|
||||
updateHeartbeatPayload(*targetMap.snapshot(), true);
|
||||
folly::coro::blockingWait(mgmtd->heartbeat());
|
||||
}
|
||||
LOG_COMMAND(INFO, "Stop routingStore", stopMgmtdClient());
|
||||
|
||||
LOG_COMMAND(INFO, "Stop readPool", readPool.stopAndJoin());
|
||||
LOG_COMMAND(INFO, "Stop updatePool", updatePool.stopAndJoin());
|
||||
LOG_COMMAND(INFO, "Stop syncPool", syncPool.stopAndJoin());
|
||||
LOG_COMMAND(INFO, "Stop defaultPool", defaultPool.stopAndJoin());
|
||||
|
||||
auto snapshot = targetMap.release();
|
||||
std::vector<std::shared_ptr<StorageTarget>> targets;
|
||||
for (auto &[targetId, target] : snapshot->getTargets()) {
|
||||
if (target.storageTarget != nullptr) {
|
||||
targets.push_back(target.storageTarget);
|
||||
}
|
||||
}
|
||||
LOG_COMMAND(INFO, "Reset target map", snapshot.reset());
|
||||
|
||||
XLOGF(WARNING, "start to release {} targets", targets.size());
|
||||
std::atomic<uint32_t> released{};
|
||||
std::atomic<uint32_t> synced{};
|
||||
for (auto &target : targets) {
|
||||
executor.randomPick().add([&, t = std::move(target)]() mutable {
|
||||
auto result = t->release();
|
||||
if (UNLIKELY(!result)) {
|
||||
XLOGF(CRITICAL, "storage target sync meta failed {}, error: {}", t->path(), result.error());
|
||||
} else {
|
||||
++synced;
|
||||
}
|
||||
t = nullptr;
|
||||
++released;
|
||||
});
|
||||
}
|
||||
|
||||
for (int i = 0; released != targets.size(); ++i) {
|
||||
XLOGF_IF(INFO, i % 5 == 0, "Waiting for release targets finished...");
|
||||
std::this_thread::sleep_for(100_ms);
|
||||
}
|
||||
|
||||
XLOGF(WARNING, "released {} targets, synced {} targets", released.load(), synced.load());
|
||||
|
||||
LOG_COMMAND(INFO, "Clear storageTargets", storageTargets.globalFileStore().clear(executor));
|
||||
LOG_COMMAND(INFO, "Clear rdmabufPool", rdmabufPool.clear(executor));
|
||||
if (config.speed_up_quit()) {
|
||||
for (auto &engine : storageTargets.engines()) {
|
||||
engine->speed_up_quit();
|
||||
}
|
||||
}
|
||||
return Void{};
|
||||
}
|
||||
|
||||
Result<Void> Components::stopMgmtdClient() {
|
||||
if (mgmtdClient.load()) {
|
||||
folly::coro::blockingWait(mgmtdClient.load()->stop());
|
||||
}
|
||||
mgmtdClient.store(nullptr);
|
||||
if (netClient) {
|
||||
netClient->stopAndJoin();
|
||||
}
|
||||
netClient.reset();
|
||||
return Void{};
|
||||
}
|
||||
|
||||
Result<robin_hood::unordered_set<std::string>> Components::getActiveClientsList() {
|
||||
auto result = folly::coro::blockingWait(mgmtdClient.load()->listClientSessions());
|
||||
RETURN_AND_LOG_ON_ERROR(result);
|
||||
if (result->bootstrapping) {
|
||||
auto msg = fmt::format("mgmtd is bootstrapping, skip");
|
||||
XLOG(WARNING, msg);
|
||||
return makeError(StorageClientCode::kRoutingError, std::move(msg));
|
||||
}
|
||||
|
||||
robin_hood::unordered_set<std::string> activeClients;
|
||||
for (auto &client : result->sessions) {
|
||||
activeClients.emplace(std::move(client.clientId));
|
||||
}
|
||||
return Result<robin_hood::unordered_set<std::string>>(std::move(activeClients));
|
||||
}
|
||||
|
||||
void Components::triggerHeartbeatIfNeed() {
|
||||
if (triggerHeartbeatFlag.exchange(0)) {
|
||||
mgmtdClient.load()->triggerHeartbeat();
|
||||
}
|
||||
}
|
||||
|
||||
void Components::updateHeartbeatPayload(const TargetMap &targetMap, bool offline /* = false */) {
|
||||
flat::StorageHeartbeatInfo heartbeat;
|
||||
for (auto &[targetId, target] : targetMap.getTargets()) {
|
||||
flat::LocalTargetInfo targetInfo;
|
||||
targetInfo.targetId = targetId;
|
||||
targetInfo.localState = offline ? flat::LocalTargetState::OFFLINE : target.localState;
|
||||
targetInfo.diskIndex = target.diskIndex;
|
||||
targetInfo.lowSpace = target.lowSpace;
|
||||
monitor::TagSet tag;
|
||||
tag.addTag("instance", fmt::format("{}", targetId));
|
||||
targetStateRecorder.set(uint32_t(target.localState), tag);
|
||||
if (targetInfo.localState != flat::LocalTargetState::OFFLINE) {
|
||||
targetInfo.usedSize = target.storageTarget->usedSize();
|
||||
targetInfo.chainVersion = target.vChainId.chainVer;
|
||||
}
|
||||
heartbeat.targets.push_back(targetInfo);
|
||||
}
|
||||
mgmtdClient.load()->updateHeartbeatPayload(heartbeat);
|
||||
++triggerHeartbeatFlag;
|
||||
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
124
src/storage/service/Components.h
Normal file
124
src/storage/service/Components.h
Normal file
@@ -0,0 +1,124 @@
|
||||
#pragma once
|
||||
|
||||
#include <folly/concurrency/AtomicSharedPtr.h>
|
||||
|
||||
#include "client/mgmtd/MgmtdClientForServer.h"
|
||||
#include "client/storage/StorageMessenger.h"
|
||||
#include "common/utils/ConfigBase.h"
|
||||
#include "common/utils/DynamicCoroutinesPool.h"
|
||||
#include "common/utils/LockManager.h"
|
||||
#include "common/utils/RobinHood.h"
|
||||
#include "fbs/storage/Service.h"
|
||||
#include "storage/aio/AioReadWorker.h"
|
||||
#include "storage/service/BufferPool.h"
|
||||
#include "storage/service/StorageOperator.h"
|
||||
#include "storage/service/TargetMap.h"
|
||||
#include "storage/store/StorageTargets.h"
|
||||
#include "storage/sync/ResyncWorker.h"
|
||||
#include "storage/worker/AllocateWorker.h"
|
||||
#include "storage/worker/CheckWorker.h"
|
||||
#include "storage/worker/DumpWorker.h"
|
||||
#include "storage/worker/PunchHoleWorker.h"
|
||||
#include "storage/worker/SyncMetaKvWorker.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
class ReliableForwarding;
|
||||
|
||||
struct Components {
|
||||
struct Config : public ConfigBase<Config> {
|
||||
CONFIG_OBJ(base, net::Server::Config, [](net::Server::Config &c) {
|
||||
c.set_groups_length(2);
|
||||
c.groups(0).listener().set_listen_port(8000);
|
||||
c.groups(0).set_network_type(net::Address::RDMA);
|
||||
c.groups(0).set_services({"StorageSerde"});
|
||||
|
||||
c.groups(1).set_network_type(net::Address::TCP);
|
||||
c.groups(1).listener().set_listen_port(9000);
|
||||
c.groups(1).set_use_independent_thread_pool(true);
|
||||
c.groups(1).set_services({"Core"});
|
||||
|
||||
c.thread_pool().set_num_io_threads(32);
|
||||
c.thread_pool().set_num_proc_threads(32);
|
||||
});
|
||||
|
||||
CONFIG_OBJ(client, net::Client::Config);
|
||||
CONFIG_OBJ(mgmtd, hf3fs::client::MgmtdClientForServer::Config);
|
||||
CONFIG_OBJ(targets, StorageTargets::Config);
|
||||
CONFIG_OBJ(storage, StorageOperator::Config);
|
||||
CONFIG_OBJ(reliable_forwarding, ReliableForwarding::Config);
|
||||
CONFIG_OBJ(reliable_update, ReliableUpdate::Config);
|
||||
CONFIG_OBJ(buffer_pool, BufferPool::Config);
|
||||
CONFIG_OBJ(aio_read_worker, AioReadWorker::Config);
|
||||
CONFIG_OBJ(sync_worker, ResyncWorker::Config);
|
||||
CONFIG_OBJ(check_worker, CheckWorker::Config);
|
||||
CONFIG_OBJ(dump_worker, DumpWorker::Config);
|
||||
CONFIG_OBJ(allocate_worker, AllocateWorker::Config);
|
||||
CONFIG_OBJ(sync_meta_kv_worker, SyncMetaKvWorker::Config);
|
||||
CONFIG_OBJ(forward_client, net::Client::Config);
|
||||
CONFIG_OBJ(coroutines_pool_read, DynamicCoroutinesPool::Config);
|
||||
CONFIG_OBJ(coroutines_pool_update, DynamicCoroutinesPool::Config);
|
||||
CONFIG_OBJ(coroutines_pool_sync, DynamicCoroutinesPool::Config);
|
||||
CONFIG_OBJ(coroutines_pool_default, DynamicCoroutinesPool::Config);
|
||||
CONFIG_HOT_UPDATED_ITEM(use_coroutines_pool_read, true);
|
||||
CONFIG_HOT_UPDATED_ITEM(use_coroutines_pool_update, true);
|
||||
CONFIG_HOT_UPDATED_ITEM(speed_up_quit, true);
|
||||
};
|
||||
|
||||
Components(const Config &config);
|
||||
|
||||
Result<Void> start(const flat::AppInfo &appInfo, net::ThreadPoolGroup &tpg);
|
||||
Result<Void> waitRoutingInfo(const flat::AppInfo &appInfo, folly::CPUThreadPoolExecutor &executor);
|
||||
Result<Void> refreshRoutingInfo();
|
||||
Result<Void> stopAndJoin(CPUExecutorGroup &executor);
|
||||
Result<Void> stopMgmtdClient();
|
||||
const flat::AppInfo &getAppInfo() const { return appInfo; }
|
||||
|
||||
Result<robin_hood::unordered_set<std::string>> getActiveClientsList();
|
||||
void triggerHeartbeatIfNeed();
|
||||
|
||||
inline DynamicCoroutinesPool &getCoroutinesPool(uint16_t methodId) {
|
||||
if (LIKELY(config.use_coroutines_pool_read()) && methodId == StorageSerde<>::batchReadMethodId) {
|
||||
return readPool;
|
||||
}
|
||||
if (LIKELY(config.use_coroutines_pool_update()) &&
|
||||
(methodId == StorageSerde<>::writeMethodId || methodId == StorageSerde<>::updateMethodId)) {
|
||||
return updatePool;
|
||||
}
|
||||
if (methodId == StorageSerde<>::syncStartMethodId || methodId == StorageSerde<>::getAllChunkMetadataMethodId) {
|
||||
return syncPool;
|
||||
}
|
||||
return defaultPool;
|
||||
}
|
||||
|
||||
protected:
|
||||
void updateHeartbeatPayload(const TargetMap &map, bool offline = false);
|
||||
|
||||
public:
|
||||
ConstructLog<"storage::Components"> constructLog_;
|
||||
const Config &config;
|
||||
flat::AppInfo appInfo;
|
||||
std::unique_ptr<net::Client> netClient;
|
||||
folly::atomic_shared_ptr<hf3fs::client::IMgmtdClientForServer> mgmtdClient;
|
||||
BufferPool rdmabufPool;
|
||||
AtomicallyTargetMap targetMap;
|
||||
StorageTargets storageTargets;
|
||||
AioReadWorker aioReadWorker;
|
||||
client::StorageMessenger messenger;
|
||||
ResyncWorker resyncWorker;
|
||||
CheckWorker checkWorker;
|
||||
DumpWorker dumpWorker;
|
||||
AllocateWorker allocateWorker;
|
||||
PunchHoleWorker punchHoleWorker;
|
||||
SyncMetaKvWorker syncMetaKvWorker;
|
||||
ReliableForwarding reliableForwarding;
|
||||
DynamicCoroutinesPool readPool;
|
||||
DynamicCoroutinesPool updatePool;
|
||||
DynamicCoroutinesPool syncPool;
|
||||
DynamicCoroutinesPool defaultPool;
|
||||
StorageOperator storageOperator;
|
||||
ReliableUpdate reliableUpdate;
|
||||
std::atomic<uint32_t> triggerHeartbeatFlag{};
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
281
src/storage/service/ReliableForwarding.cc
Normal file
281
src/storage/service/ReliableForwarding.cc
Normal file
@@ -0,0 +1,281 @@
|
||||
#include "storage/service/ReliableForwarding.h"
|
||||
|
||||
#include <folly/experimental/coro/Sleep.h>
|
||||
|
||||
#include "common/app/ApplicationBase.h"
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/utils/Duration.h"
|
||||
#include "common/utils/ExponentialBackoffRetry.h"
|
||||
#include "fbs/storage/Common.h"
|
||||
#include "storage/service/Components.h"
|
||||
#include "storage/service/TargetMap.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
namespace {
|
||||
|
||||
monitor::OperationRecorder reliableForwardRecorder("storage.reliable_forward");
|
||||
monitor::OperationRecorder syncingReadRecorder("storage.syncing_read");
|
||||
monitor::OperationRecorder updateRemoteRecorder("storage.update_remote");
|
||||
|
||||
monitor::CountRecorder forwardWriteBytes("storage.forward.write_bytes");
|
||||
monitor::DistributionRecorder forwardWriteDist("storage.forward.write_dist");
|
||||
monitor::CountRecorder forwardSyncingBytes("storage.forward.syncing_bytes");
|
||||
monitor::DistributionRecorder forwardSyncingDist("storage.forward.syncing_dist");
|
||||
|
||||
} // namespace
|
||||
|
||||
using namespace std::chrono_literals;
|
||||
|
||||
Result<Void> ReliableForwarding::init() { return Void{}; }
|
||||
|
||||
Result<Void> ReliableForwarding::stopAndJoin() { return Void{}; }
|
||||
|
||||
CoTask<IOResult> ReliableForwarding::forwardWithRetry(ServiceRequestContext &requestCtx,
|
||||
const UpdateReq &req,
|
||||
const net::RDMARemoteBuf &rdmabuf,
|
||||
const ChunkEngineUpdateJob &chunkEngineJob,
|
||||
TargetPtr &target,
|
||||
CommitIO &commitIO,
|
||||
bool allowOutdatedChainVer /* = true */) {
|
||||
auto startTime = RelativeTime::now();
|
||||
|
||||
auto recordGuard = reliableForwardRecorder.record();
|
||||
IOResult ioResult;
|
||||
|
||||
ExponentialBackoffRetry retry(config_.retry_first_wait().asMs(),
|
||||
config_.retry_max_wait().asMs(),
|
||||
config_.retry_total_time().asMs());
|
||||
for (uint32_t retryCount = 0; !stopped_; ++retryCount) {
|
||||
auto waitTime = retry.getWaitTime();
|
||||
|
||||
auto targetResult = components_.targetMap.getByChainId(req.payload.key.vChainId, allowOutdatedChainVer);
|
||||
CO_RETURN_ON_ERROR(targetResult);
|
||||
target = std::move(*targetResult);
|
||||
|
||||
auto ioResult = co_await forward(req, retryCount, rdmabuf, chunkEngineJob, target, commitIO, waitTime);
|
||||
if (LIKELY(bool(ioResult.lengthInfo))) {
|
||||
recordGuard.succ();
|
||||
co_return ioResult;
|
||||
} else if (ioResult.lengthInfo.error().code() == StorageCode::kNoSuccessorTarget) {
|
||||
recordGuard.succ();
|
||||
co_return ioResult;
|
||||
}
|
||||
|
||||
// TODO(SF): fine-grained error handling.
|
||||
auto code = ioResult.lengthInfo.error().code();
|
||||
if (!allowOutdatedChainVer && code == StorageClientCode::kRoutingVersionMismatch) {
|
||||
XLOGF(ERR,
|
||||
"forwarding routing version mismatch, req {}, result {}, elapsed {}",
|
||||
req,
|
||||
ioResult,
|
||||
(RelativeTime::now() - startTime).asMs());
|
||||
co_return ioResult;
|
||||
}
|
||||
|
||||
if (waitTime.count() == 0) {
|
||||
XLOGF_IF(DFATAL,
|
||||
!requestCtx.debugFlags.faultInjectionEnabled(),
|
||||
"forwarding timeout with error, req {}, result {}",
|
||||
req,
|
||||
ioResult);
|
||||
co_return ioResult;
|
||||
} else if (code != RPCCode::kTimeout) {
|
||||
XLOGF(WARNING,
|
||||
"forwarding wait and retry, req {}, error {}, elapsed {}",
|
||||
req,
|
||||
ioResult,
|
||||
(RelativeTime::now() - startTime).asMs());
|
||||
constexpr auto checkInterval = 100ms;
|
||||
for (auto elapsed = 0ms; elapsed < waitTime && !stopped_; elapsed += checkInterval) {
|
||||
auto targetResult = components_.targetMap.getByChainId(req.payload.key.vChainId, allowOutdatedChainVer);
|
||||
CO_RETURN_ON_ERROR(targetResult);
|
||||
target = std::move(*targetResult);
|
||||
if (!target->successor.has_value()) {
|
||||
break;
|
||||
}
|
||||
co_await folly::coro::sleep(std::min(checkInterval, waitTime - elapsed));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto msg = fmt::format("req is refused because of stopping, req {}", req);
|
||||
XLOG(ERR, msg);
|
||||
co_return makeError(RPCCode::kRequestRefused, std::move(msg));
|
||||
}
|
||||
|
||||
CoTask<IOResult> ReliableForwarding::forward(const UpdateReq &req,
|
||||
uint32_t retryCount,
|
||||
const net::RDMARemoteBuf &rdmabuf,
|
||||
const ChunkEngineUpdateJob &chunkEngineJob,
|
||||
TargetPtr &target,
|
||||
CommitIO &commitIO,
|
||||
std::chrono::milliseconds timeout) {
|
||||
if (!target->successor.has_value()) {
|
||||
// use the latest chain version.
|
||||
commitIO.commitChainVer = target->vChainId.chainVer;
|
||||
co_return makeError(StorageCode::kNoSuccessorTarget);
|
||||
}
|
||||
|
||||
auto ioResult = co_await doForward(req, rdmabuf, chunkEngineJob, retryCount, *target, commitIO.isSyncing, timeout);
|
||||
if (ioResult.lengthInfo) {
|
||||
commitIO.commitVer = ioResult.commitVer;
|
||||
// use successor's chain version.
|
||||
commitIO.commitChainVer = ioResult.commitChainVer;
|
||||
|
||||
if (ioResult.commitChainVer > target->vChainId.chainVer) {
|
||||
// the remote obtains a higher chain version, and the local need to obtain the latest version by retry.
|
||||
auto msg = fmt::format("the remote obtains a higher chain version {} > current {}, req {}",
|
||||
ioResult.commitChainVer,
|
||||
target->vChainId.chainVer,
|
||||
req);
|
||||
XLOGF(WARNING, "{}", msg);
|
||||
co_return makeError(StorageCode::kChainVersionMismatch, std::move(msg));
|
||||
}
|
||||
}
|
||||
co_return ioResult;
|
||||
}
|
||||
|
||||
CoTask<IOResult> ReliableForwarding::doForward(const UpdateReq &req,
|
||||
const net::RDMARemoteBuf &rdmabuf,
|
||||
const ChunkEngineUpdateJob &chunkEngineJob,
|
||||
uint32_t retryCount,
|
||||
const Target &target,
|
||||
bool &isSyncing,
|
||||
std::chrono::milliseconds timeout) {
|
||||
UpdateReq updateReq = req;
|
||||
updateReq.options.fromClient = false;
|
||||
updateReq.retryCount = retryCount;
|
||||
updateReq.payload.rdmabuf = rdmabuf;
|
||||
updateReq.payload.key.vChainId.chainVer = target.vChainId.chainVer;
|
||||
|
||||
auto buffer = components_.rdmabufPool.get();
|
||||
isSyncing = target.successor->targetInfo.publicState == hf3fs::flat::PublicTargetState::SYNCING;
|
||||
if (isSyncing) {
|
||||
updateReq.options.isSyncing = true;
|
||||
updateReq.options.commitChainVer = target.vChainId.chainVer;
|
||||
}
|
||||
|
||||
bool readForSyncing = req.payload.isWriteTruncateExtend() && isSyncing &&
|
||||
(req.options.isSyncing || req.payload.length != req.payload.chunkSize);
|
||||
if (readForSyncing) {
|
||||
auto recordGuard = syncingReadRecorder.record();
|
||||
|
||||
// read the entire chunk.
|
||||
IOResult readResult;
|
||||
auto allocateResult = buffer.tryAllocate(req.payload.chunkSize);
|
||||
if (UNLIKELY(!allocateResult)) {
|
||||
allocateResult = co_await buffer.allocate(req.payload.chunkSize);
|
||||
}
|
||||
if (UNLIKELY(!allocateResult)) {
|
||||
readResult.lengthInfo = makeError(std::move(allocateResult.error()));
|
||||
co_return readResult;
|
||||
}
|
||||
auto &readBuf = *allocateResult;
|
||||
|
||||
ReadIO payload;
|
||||
payload.key = updateReq.payload.key;
|
||||
payload.offset = 0;
|
||||
payload.length = req.payload.chunkSize;
|
||||
BatchReadJob batch(payload, target.storageTarget.get(), readResult, req.payload.checksum.type);
|
||||
batch.setRecalculateChecksum();
|
||||
batch.front().state().localbuf = readBuf;
|
||||
batch.front().state().bufferIndex = buffer.index();
|
||||
batch.front().state().readUncommitted = true;
|
||||
if (chunkEngineJob.chunk()) {
|
||||
batch.front().state().chunkEngineJob.set(nullptr, chunkEngineJob.chunk()->raw_chunk());
|
||||
}
|
||||
|
||||
co_await components_.aioReadWorker.enqueue(&batch);
|
||||
co_await batch.complete();
|
||||
CO_RETURN_ON_ERROR(readResult.lengthInfo); // OK.
|
||||
|
||||
// clear the inline data if the update is built from full chunk read
|
||||
if (BITFLAGS_CONTAIN(updateReq.featureFlags, FeatureFlags::SEND_DATA_INLINE)) {
|
||||
BITFLAGS_CLEAR(updateReq.featureFlags, FeatureFlags::SEND_DATA_INLINE);
|
||||
updateReq.payload.inlinebuf.data.clear();
|
||||
}
|
||||
|
||||
auto length = *readResult.lengthInfo;
|
||||
updateReq.payload.updateVer = readResult.updateVer;
|
||||
if (req.options.isSyncing) {
|
||||
updateReq.options.commitChainVer = batch.front().result().commitChainVer;
|
||||
}
|
||||
updateReq.payload.offset = 0;
|
||||
updateReq.payload.length = length;
|
||||
updateReq.payload.rdmabuf = readBuf.first(length).toRemoteBuf();
|
||||
updateReq.payload.checksum = batch.front().state().chunkChecksum;
|
||||
updateReq.payload.updateType = UpdateType::WRITE;
|
||||
|
||||
if (length <= config_.max_inline_forward_bytes()) {
|
||||
updateReq.payload.inlinebuf.data.assign(readBuf.ptr(), readBuf.ptr() + length);
|
||||
BITFLAGS_SET(updateReq.featureFlags, hf3fs::storage::FeatureFlags::SEND_DATA_INLINE);
|
||||
}
|
||||
|
||||
recordGuard.succ();
|
||||
} else if (isSyncing && !req.payload.isRemove() && chunkEngineJob.chunk() == nullptr) {
|
||||
auto chunkResult = target.storageTarget->queryChunk(req.payload.key.chunkId);
|
||||
if (UNLIKELY(!chunkResult)) {
|
||||
XLOGF(ERR, "forward query chunk failed, req {}, error {}", updateReq, chunkResult.error());
|
||||
co_return makeError(std::move(chunkResult.error()));
|
||||
}
|
||||
updateReq.payload.updateVer = chunkResult->updateVer;
|
||||
}
|
||||
|
||||
auto recordGuard = updateRemoteRecorder.record();
|
||||
auto addrResult = target.getSuccessorAddr();
|
||||
if (UNLIKELY(!addrResult)) {
|
||||
XLOGF(ERR, "target forward addr invalid, target {}", target);
|
||||
co_return makeError(std::move(addrResult.error()));
|
||||
}
|
||||
net::UserRequestOptions reqOptions;
|
||||
reqOptions.timeout = Duration{timeout};
|
||||
auto updateResult = co_await components_.messenger.update(*addrResult, updateReq, &reqOptions);
|
||||
if (UNLIKELY(!updateResult)) {
|
||||
XLOGF(ERR, "forward timeout, req {}, result {}", updateReq, updateResult);
|
||||
co_return makeError(std::move(updateResult.error()));
|
||||
}
|
||||
if (LIKELY(bool(updateResult->result.lengthInfo))) {
|
||||
if (target.vChainId.chainVer < updateResult->result.commitChainVer) {
|
||||
auto msg = fmt::format("chain version local < remote, req {} local {} remote {}",
|
||||
updateReq,
|
||||
target,
|
||||
updateResult->result);
|
||||
XLOG(ERR, msg);
|
||||
co_return makeError(StorageCode::kChainVersionMismatch, std::move(msg));
|
||||
}
|
||||
|
||||
auto length = *updateResult->result.lengthInfo;
|
||||
monitor::TagSet tag;
|
||||
tag.addTag("instance", fmt::format("{}", target.targetId));
|
||||
if (isSyncing) {
|
||||
updateResult->result.updateVer = req.payload.updateVer;
|
||||
forwardSyncingBytes.addSample(length, tag);
|
||||
forwardSyncingDist.addSample(length, tag);
|
||||
} else {
|
||||
forwardWriteBytes.addSample(length, tag);
|
||||
forwardWriteDist.addSample(length, tag);
|
||||
}
|
||||
|
||||
recordGuard.succ();
|
||||
} else {
|
||||
XLOGF(ERR, "forward failed, req {}, result {}", updateReq, updateResult->result);
|
||||
auto errorCode = updateResult->result.lengthInfo.error().code();
|
||||
if (errorCode == StorageCode::kChecksumMismatch) {
|
||||
auto reqChecksum = updateReq.payload.checksum;
|
||||
auto realChecksum = ChecksumInfo::create(reqChecksum.type,
|
||||
(const uint8_t *)updateReq.payload.rdmabuf.addr(),
|
||||
updateReq.payload.length);
|
||||
if (reqChecksum != realChecksum) {
|
||||
XLOGF(DFATAL,
|
||||
"local rdma buffer is corrupted local {} != client {}, req: {}, kill self...",
|
||||
realChecksum,
|
||||
reqChecksum,
|
||||
req);
|
||||
ApplicationBase::handleSignal(SIGUSR2);
|
||||
}
|
||||
}
|
||||
}
|
||||
co_return updateResult->result;
|
||||
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
65
src/storage/service/ReliableForwarding.h
Normal file
65
src/storage/service/ReliableForwarding.h
Normal file
@@ -0,0 +1,65 @@
|
||||
#pragma once
|
||||
|
||||
#include "client/storage/StorageMessenger.h"
|
||||
#include "common/net/Client.h"
|
||||
#include "common/net/ib/RDMABuf.h"
|
||||
#include "common/utils/ConfigBase.h"
|
||||
#include "fbs/storage/Common.h"
|
||||
#include "storage/update/UpdateJob.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
struct Components;
|
||||
struct Target;
|
||||
|
||||
class ReliableForwarding {
|
||||
public:
|
||||
struct Config : ConfigBase<Config> {
|
||||
CONFIG_HOT_UPDATED_ITEM(retry_first_wait, 100_ms);
|
||||
CONFIG_HOT_UPDATED_ITEM(retry_max_wait, 1000_ms);
|
||||
CONFIG_HOT_UPDATED_ITEM(retry_total_time, 60_s);
|
||||
CONFIG_HOT_UPDATED_ITEM(max_inline_forward_bytes, Size{});
|
||||
};
|
||||
|
||||
ReliableForwarding(const Config &config, Components &components)
|
||||
: config_(config),
|
||||
components_(components) {}
|
||||
|
||||
Result<Void> init();
|
||||
|
||||
void beforeStop() { stopped_ = true; }
|
||||
|
||||
Result<Void> stopAndJoin();
|
||||
|
||||
CoTask<IOResult> forwardWithRetry(ServiceRequestContext &requestCtx,
|
||||
const UpdateReq &req,
|
||||
const net::RDMARemoteBuf &rdmabuf,
|
||||
const ChunkEngineUpdateJob &chunkEngineJob,
|
||||
TargetPtr &target,
|
||||
CommitIO &commitIO,
|
||||
bool allowOutdatedChainVer = true);
|
||||
|
||||
CoTask<IOResult> forward(const UpdateReq &req,
|
||||
uint32_t retryCount,
|
||||
const net::RDMARemoteBuf &rdmabuf,
|
||||
const ChunkEngineUpdateJob &chunkEngineJob,
|
||||
TargetPtr &target,
|
||||
CommitIO &commitIO,
|
||||
std::chrono::milliseconds timeout);
|
||||
|
||||
CoTask<IOResult> doForward(const UpdateReq &req,
|
||||
const net::RDMARemoteBuf &rdmabuf,
|
||||
const ChunkEngineUpdateJob &chunkEngineJob,
|
||||
uint32_t retryCount,
|
||||
const Target &target,
|
||||
bool &isSyncing,
|
||||
std::chrono::milliseconds timeout);
|
||||
|
||||
private:
|
||||
ConstructLog<"storage::ReliableForwarding"> constructLog_;
|
||||
const Config &config_;
|
||||
Components &components_;
|
||||
std::atomic<bool> stopped_ = false;
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
158
src/storage/service/ReliableUpdate.cc
Normal file
158
src/storage/service/ReliableUpdate.cc
Normal file
@@ -0,0 +1,158 @@
|
||||
#include "storage/service/ReliableUpdate.h"
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/utils/Duration.h"
|
||||
#include "storage/service/Components.h"
|
||||
#include "storage/service/StorageOperator.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
monitor::OperationRecorder reliableUpdateRecorder{"storage.reliable_update"};
|
||||
monitor::CountRecorder reliableUpdateWaited{"storage.reliable_update.waited"};
|
||||
monitor::CountRecorder reliableUpdateDuplidate{"storage.reliable_update.duplicate"};
|
||||
monitor::CountRecorder reliableUpdateCached{"storage.reliable_update.cached"};
|
||||
monitor::OperationRecorder waitChannelLockRecorder{"storage.wait_channel_lock"};
|
||||
|
||||
CoTask<IOResult> ReliableUpdate::update(ServiceRequestContext &requestCtx,
|
||||
UpdateReq &req,
|
||||
net::IBSocket *ibSocket,
|
||||
TargetPtr &target) {
|
||||
XLOGF(DBG1, "Start reliable update, tag: {}, req: {}", req.tag, req);
|
||||
|
||||
if (UNLIKELY(stopped_)) {
|
||||
auto msg = fmt::format("req is refused because of stopping, req {}", req);
|
||||
XLOG(ERR, msg);
|
||||
co_return makeError(RPCCode::kRequestRefused, std::move(msg));
|
||||
}
|
||||
|
||||
// 1. check if channel id is valid.
|
||||
if (req.tag.channel.id == ChannelId{0}) {
|
||||
XLOGF(DFATAL,
|
||||
"{} request has invalid message tag {}: {}",
|
||||
magic_enum::enum_name(req.payload.updateType),
|
||||
req.tag,
|
||||
req);
|
||||
co_return makeError(StorageClientCode::kFoundBug);
|
||||
}
|
||||
|
||||
// 2. get cached.
|
||||
auto clientId = req.tag.clientId;
|
||||
auto reqResult = shards_.withLock(
|
||||
[&](ClientMap &map) {
|
||||
auto &clientStatus = map[clientId];
|
||||
if (clientStatus == nullptr) {
|
||||
clientStatus = std::make_shared<ClientStatus>();
|
||||
}
|
||||
auto key = std::pair<ChainId, ChannelId>(req.payload.key.vChainId.chainId, req.tag.channel.id);
|
||||
auto &reqResult = clientStatus->channelMap[key];
|
||||
clientStatus->lastUsedTime = UtcClock::now();
|
||||
return std::shared_ptr<ReqResult>(clientStatus, &reqResult);
|
||||
},
|
||||
clientId);
|
||||
|
||||
// 3. lock channel.
|
||||
auto lockRecordGuard = waitChannelLockRecorder.record();
|
||||
folly::coro::Baton baton;
|
||||
auto lock = target->storageTarget->tryLockChannel(baton, fmt::format("{}:{}", clientId, req.tag.channel.id));
|
||||
if (!lock.locked()) {
|
||||
reliableUpdateWaited.addSample(1);
|
||||
XLOGF(ERR, "Channel is locked, need retry, tag: {}, req: {}", req.tag, req);
|
||||
co_return makeError(StorageCode::kChannelIsLocked);
|
||||
}
|
||||
lockRecordGuard.report(true);
|
||||
|
||||
IOResult updateResult;
|
||||
if (req.tag.channel.seqnum < reqResult->channelSeqnum) {
|
||||
reliableUpdateDuplidate.addSample(1);
|
||||
XLOGF(WARN, "Find a duplicate update, tag: {}, cached result: {}, req: {}", req.tag, *reqResult, req);
|
||||
co_return makeError(StorageClientCode::kDuplicateUpdate);
|
||||
}
|
||||
|
||||
// 4. return cached result.
|
||||
if (req.tag.channel.seqnum == reqResult->channelSeqnum &&
|
||||
target->storageTarget->generationId() == reqResult->generationId) {
|
||||
if (req.tag.requestId != reqResult->requestId) {
|
||||
XLOGF(DFATAL,
|
||||
"[BUG] Message tag {} is already assigned to another update, cached result: {}, req: {}",
|
||||
req.tag,
|
||||
*reqResult,
|
||||
req);
|
||||
co_return makeError(StorageClientCode::kFoundBug);
|
||||
}
|
||||
|
||||
if (reqResult->updateResult.lengthInfo.hasValue()) {
|
||||
if (req.payload.updateVer == 0 || req.payload.updateVer == reqResult->updateResult.updateVer) {
|
||||
updateResult = reqResult->updateResult;
|
||||
|
||||
if (*updateResult.lengthInfo != req.payload.length && !req.payload.isExtend()) {
|
||||
updateResult.lengthInfo = req.payload.length;
|
||||
XLOGF(WARN,
|
||||
"Cached length info {} not equal to write size in request {}, fixed update result: {}",
|
||||
reqResult->updateResult.lengthInfo,
|
||||
req,
|
||||
updateResult);
|
||||
}
|
||||
|
||||
reliableUpdateCached.addSample(1);
|
||||
XLOGF(DBG1, "Return cached update result, tag: {}, cached result: {}, req: {}", req.tag, *reqResult, req);
|
||||
co_return updateResult;
|
||||
} else {
|
||||
XLOGF(CRITICAL,
|
||||
"Cached update version not equal to request update version, req:{}, cached result: {}",
|
||||
req,
|
||||
*reqResult);
|
||||
}
|
||||
} else if (req.payload.updateVer == 0 && !target->storageTarget->useChunkEngine() &&
|
||||
reqResult->succUpdateVer != 0) {
|
||||
XLOGF(CRITICAL, "Pick up previous update version, tag: {}, cached result: {}, req: {}", req.tag, *reqResult, req);
|
||||
req.payload.updateVer = reqResult->succUpdateVer;
|
||||
}
|
||||
}
|
||||
|
||||
// 5. start a new task.
|
||||
auto recordGuard = reliableUpdateRecorder.record();
|
||||
updateResult = co_await components_.storageOperator.handleUpdate(requestCtx, req, ibSocket, target);
|
||||
if (LIKELY(bool(updateResult.lengthInfo))) {
|
||||
recordGuard.succ();
|
||||
}
|
||||
|
||||
*reqResult = {req.tag.channel.seqnum,
|
||||
req.tag.requestId,
|
||||
updateResult,
|
||||
req.payload.updateVer,
|
||||
target->storageTarget->generationId()};
|
||||
|
||||
XLOGF(DBG1, "Completed reliable update, tag: {}, result: {}", req.tag, *reqResult);
|
||||
co_return updateResult;
|
||||
}
|
||||
|
||||
Result<Void> ReliableUpdate::cleanUpExpiredClients(const robin_hood::unordered_set<std::string> &activeClients) {
|
||||
if (!config_.clean_up_expired_clients()) {
|
||||
return Void{};
|
||||
}
|
||||
if (activeClients.empty()) {
|
||||
XLOGF(ERR, "activeClients is empty!");
|
||||
return Void{};
|
||||
}
|
||||
auto allZero = ClientId::zero();
|
||||
std::size_t cleanUpClientCount = 0;
|
||||
shards_.iterate([&](ClientMap &map) {
|
||||
auto now = UtcClock::now();
|
||||
auto expiredClientsTimeout = config_.expired_clients_timeout();
|
||||
for (auto it = map.begin(); it != map.end();) {
|
||||
const auto &[clientId, clientStatus] = *it;
|
||||
if (!activeClients.contains(clientId.uuid.toHexString()) && clientId != allZero &&
|
||||
now >= clientStatus->lastUsedTime + expiredClientsTimeout) {
|
||||
XLOGF(WARNING, "clean up expired client {}, last used time: {}", clientId, clientStatus->lastUsedTime);
|
||||
it = map.erase(it);
|
||||
++cleanUpClientCount;
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
});
|
||||
XLOGF(WARNING, "clean up {} expired clients", cleanUpClientCount);
|
||||
return Void{};
|
||||
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
61
src/storage/service/ReliableUpdate.h
Normal file
61
src/storage/service/ReliableUpdate.h
Normal file
@@ -0,0 +1,61 @@
|
||||
#pragma once
|
||||
|
||||
#include "common/net/Transport.h"
|
||||
#include "common/utils/ConfigBase.h"
|
||||
#include "common/utils/Coroutine.h"
|
||||
#include "common/utils/Duration.h"
|
||||
#include "common/utils/LockManager.h"
|
||||
#include "common/utils/RobinHood.h"
|
||||
#include "common/utils/Shards.h"
|
||||
#include "common/utils/Size.h"
|
||||
#include "fbs/storage/Common.h"
|
||||
#include "storage/service/TargetMap.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
struct Components;
|
||||
class StorageOperator;
|
||||
|
||||
class ReliableUpdate {
|
||||
public:
|
||||
struct Config : ConfigBase<Config> {
|
||||
CONFIG_HOT_UPDATED_ITEM(clean_up_expired_clients, false);
|
||||
CONFIG_HOT_UPDATED_ITEM(expired_clients_timeout, 1_h);
|
||||
};
|
||||
ReliableUpdate(const Config &config, Components &components)
|
||||
: config_(config),
|
||||
components_(components) {}
|
||||
|
||||
CoTask<IOResult> update(ServiceRequestContext &requestCtx,
|
||||
UpdateReq &req,
|
||||
net::IBSocket *ibSocket,
|
||||
TargetPtr &target);
|
||||
|
||||
Result<Void> cleanUpExpiredClients(const robin_hood::unordered_set<std::string> &activeClients);
|
||||
|
||||
void beforeStop() { stopped_ = true; }
|
||||
|
||||
private:
|
||||
ConstructLog<"storage::ReliableUpdate"> constructLog_;
|
||||
const Config &config_;
|
||||
Components &components_;
|
||||
std::atomic<bool> stopped_ = false;
|
||||
folly::coro::Mutex mutex_;
|
||||
|
||||
struct ReqResult {
|
||||
SERDE_STRUCT_FIELD(channelSeqnum, ChannelSeqNum{0});
|
||||
SERDE_STRUCT_FIELD(requestId, RequestId{0});
|
||||
SERDE_STRUCT_FIELD(updateResult, IOResult{});
|
||||
SERDE_STRUCT_FIELD(succUpdateVer, ChunkVer{});
|
||||
SERDE_STRUCT_FIELD(generationId, uint32_t{});
|
||||
};
|
||||
|
||||
struct ClientStatus {
|
||||
std::unordered_map<std::pair<ChainId, ChannelId>, ReqResult> channelMap;
|
||||
UtcTime lastUsedTime;
|
||||
};
|
||||
using ClientMap = std::unordered_map<ClientId, std::shared_ptr<ClientStatus>>;
|
||||
Shards<ClientMap, 1024> shards_;
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
1203
src/storage/service/StorageOperator.cc
Normal file
1203
src/storage/service/StorageOperator.cc
Normal file
File diff suppressed because it is too large
Load Diff
161
src/storage/service/StorageOperator.h
Normal file
161
src/storage/service/StorageOperator.h
Normal file
@@ -0,0 +1,161 @@
|
||||
#pragma once
|
||||
|
||||
#include <folly/concurrency/ConcurrentHashMap.h>
|
||||
#include <folly/executors/CPUThreadPoolExecutor.h>
|
||||
|
||||
#include "analytics/StructuredTraceLog.h"
|
||||
#include "client/mgmtd/IMgmtdClientForServer.h"
|
||||
#include "client/mgmtd/RoutingInfo.h"
|
||||
#include "client/storage/StorageMessenger.h"
|
||||
#include "common/net/Server.h"
|
||||
#include "common/net/Transport.h"
|
||||
#include "common/net/ib/IBSocket.h"
|
||||
#include "common/net/ib/RDMABuf.h"
|
||||
#include "common/utils/Address.h"
|
||||
#include "common/utils/ConfigBase.h"
|
||||
#include "common/utils/Coroutine.h"
|
||||
#include "common/utils/LockManager.h"
|
||||
#include "common/utils/Semaphore.h"
|
||||
#include "storage/aio/AioReadWorker.h"
|
||||
#include "storage/service/BufferPool.h"
|
||||
#include "storage/service/ReliableForwarding.h"
|
||||
#include "storage/service/ReliableUpdate.h"
|
||||
#include "storage/store/StorageTargets.h"
|
||||
#include "storage/update/UpdateWorker.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
struct Components;
|
||||
|
||||
class StorageOperator {
|
||||
public:
|
||||
class Config : public ConfigBase<Config> {
|
||||
CONFIG_OBJ(write_worker, UpdateWorker::Config);
|
||||
CONFIG_OBJ(event_trace_log, analytics::StructuredTraceLog<StorageEventTrace>::Config);
|
||||
CONFIG_HOT_UPDATED_ITEM(max_num_results_per_query, uint32_t{100});
|
||||
CONFIG_HOT_UPDATED_ITEM(batch_read_job_split_size, uint32_t{1024});
|
||||
CONFIG_HOT_UPDATED_ITEM(post_buffer_per_bytes, 64_KB);
|
||||
CONFIG_HOT_UPDATED_ITEM(batch_read_ignore_chain_version, false);
|
||||
CONFIG_HOT_UPDATED_ITEM(max_concurrent_rdma_writes, 256U);
|
||||
CONFIG_HOT_UPDATED_ITEM(max_concurrent_rdma_reads, 256U);
|
||||
CONFIG_HOT_UPDATED_ITEM(read_only, false);
|
||||
CONFIG_HOT_UPDATED_ITEM(rdma_transmission_req_timeout, 0_ms);
|
||||
CONFIG_HOT_UPDATED_ITEM(apply_transmission_before_getting_semaphore, true);
|
||||
};
|
||||
|
||||
StorageOperator(const Config &config, Components &components)
|
||||
: config_(config),
|
||||
components_(components),
|
||||
updateWorker_(config_.write_worker()),
|
||||
storageEventTrace_(config.event_trace_log()) {
|
||||
for (const auto &ibdev : net::IBDevice::all()) {
|
||||
concurrentRdmaWriteSemaphore_.emplace(ibdev->id(), config.max_concurrent_rdma_writes());
|
||||
concurrentRdmaReadSemaphore_.emplace(ibdev->id(), config.max_concurrent_rdma_reads());
|
||||
}
|
||||
|
||||
onConfigUpdated_ = config_.addCallbackGuard([this]() {
|
||||
for (auto &[_, semaphore] : concurrentRdmaWriteSemaphore_) {
|
||||
semaphore.changeUsableTokens(config_.max_concurrent_rdma_writes());
|
||||
}
|
||||
for (auto &[_, semaphore] : concurrentRdmaReadSemaphore_) {
|
||||
semaphore.changeUsableTokens(config_.max_concurrent_rdma_reads());
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
Result<Void> init(uint32_t numberOfDisks);
|
||||
|
||||
Result<Void> stopAndJoin();
|
||||
|
||||
CoTryTask<BatchReadRsp> batchRead(ServiceRequestContext &requestCtx,
|
||||
const BatchReadReq &req,
|
||||
serde::CallContext &ctx);
|
||||
|
||||
CoTryTask<WriteRsp> write(ServiceRequestContext &requestCtx, const WriteReq &req, net::IBSocket *ibSocket);
|
||||
|
||||
CoTryTask<UpdateRsp> update(ServiceRequestContext &requestCtx, const UpdateReq &req, net::IBSocket *ibSocket);
|
||||
|
||||
CoTryTask<QueryLastChunkRsp> queryLastChunk(ServiceRequestContext &requestCtx, const QueryLastChunkReq &req);
|
||||
|
||||
CoTryTask<TruncateChunksRsp> truncateChunks(ServiceRequestContext &requestCtx, const TruncateChunksReq &req);
|
||||
|
||||
CoTryTask<RemoveChunksRsp> removeChunks(ServiceRequestContext &requestCtx, const RemoveChunksReq &req);
|
||||
|
||||
CoTryTask<TargetSyncInfo> syncStart(const SyncStartReq &req);
|
||||
|
||||
CoTryTask<SyncDoneRsp> syncDone(const SyncDoneReq &req);
|
||||
|
||||
CoTryTask<SpaceInfoRsp> spaceInfo(const SpaceInfoReq &req);
|
||||
|
||||
CoTryTask<CreateTargetRsp> createTarget(const CreateTargetReq &req);
|
||||
|
||||
CoTryTask<OfflineTargetRsp> offlineTarget(const OfflineTargetReq &req);
|
||||
|
||||
CoTryTask<RemoveTargetRsp> removeTarget(const RemoveTargetReq &req);
|
||||
|
||||
CoTryTask<QueryChunkRsp> queryChunk(const QueryChunkReq &req);
|
||||
|
||||
CoTryTask<GetAllChunkMetadataRsp> getAllChunkMetadata(const GetAllChunkMetadataReq &req);
|
||||
|
||||
protected:
|
||||
using ChunkMetadataProcessor = std::function<CoTryTask<void>(const ChunkId &, const ChunkMetadata &)>;
|
||||
|
||||
CoTask<IOResult> handleUpdate(ServiceRequestContext &requestCtx,
|
||||
UpdateReq &req,
|
||||
net::IBSocket *ibSocket,
|
||||
TargetPtr &target);
|
||||
|
||||
CoTask<IOResult> doUpdate(ServiceRequestContext &requestCtx,
|
||||
const UpdateIO &updateIO,
|
||||
const UpdateOptions &updateOptions,
|
||||
uint32_t featureFlags,
|
||||
const std::shared_ptr<StorageTarget> &target,
|
||||
net::IBSocket *ibSocket,
|
||||
BufferPool::Buffer &buffer,
|
||||
net::RDMARemoteBuf &remoteBuf,
|
||||
ChunkEngineUpdateJob &chunkEngineJob,
|
||||
bool allowToAllocate);
|
||||
|
||||
CoTask<IOResult> doCommit(ServiceRequestContext &requestCtx,
|
||||
const CommitIO &commitIO,
|
||||
const UpdateOptions &updateOptions,
|
||||
ChunkEngineUpdateJob &chunkEngineJob,
|
||||
uint32_t featureFlags,
|
||||
const std::shared_ptr<StorageTarget> &target);
|
||||
|
||||
Result<std::vector<std::pair<ChunkId, ChunkMetadata>>> doQuery(ServiceRequestContext &requestCtx,
|
||||
const VersionedChainId &vChainId,
|
||||
const ChunkIdRange &chunkIdRange);
|
||||
|
||||
CoTryTask<uint32_t> processQueryResults(ServiceRequestContext &requestCtx,
|
||||
const VersionedChainId &vChainId,
|
||||
const ChunkIdRange &chunkIdRanges,
|
||||
ChunkMetadataProcessor processor,
|
||||
bool &moreChunksInRange);
|
||||
|
||||
CoTask<IOResult> doTruncate(ServiceRequestContext &requestCtx,
|
||||
const TruncateChunkOp &op,
|
||||
flat::UserInfo userInfo,
|
||||
uint32_t featureFlags);
|
||||
|
||||
CoTask<IOResult> doRemove(ServiceRequestContext &requestCtx,
|
||||
const RemoveChunksOp &op,
|
||||
flat::UserInfo userInfo,
|
||||
uint32_t featureFlags);
|
||||
|
||||
private:
|
||||
friend class ReliableUpdate;
|
||||
|
||||
ConstructLog<"storage::StorageOperator"> constructLog_;
|
||||
const Config &config_;
|
||||
Components &components_;
|
||||
UpdateWorker updateWorker_;
|
||||
analytics::StructuredTraceLog<StorageEventTrace> storageEventTrace_;
|
||||
std::unique_ptr<ConfigCallbackGuard> onConfigUpdated_;
|
||||
std::map<uint8_t, hf3fs::Semaphore> concurrentRdmaWriteSemaphore_;
|
||||
std::map<uint8_t, hf3fs::Semaphore> concurrentRdmaReadSemaphore_;
|
||||
std::atomic<uint64_t> totalReadBytes_{};
|
||||
std::atomic<uint64_t> totalReadIOs_{};
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
60
src/storage/service/StorageServer.cc
Normal file
60
src/storage/service/StorageServer.cc
Normal file
@@ -0,0 +1,60 @@
|
||||
#include "storage/service/StorageServer.h"
|
||||
|
||||
#include <folly/experimental/coro/BlockingWait.h>
|
||||
#include <folly/experimental/coro/WithCancellation.h>
|
||||
#include <folly/logging/xlog.h>
|
||||
|
||||
#include "common/kv/mem/MemKVEngine.h"
|
||||
#include "common/utils/Result.h"
|
||||
#include "core/service/CoreService.h"
|
||||
#include "storage/service/ReliableForwarding.h"
|
||||
#include "storage/service/StorageService.h"
|
||||
#include "stubs/common/RealStubFactory.h"
|
||||
#include "stubs/mgmtd/MgmtdServiceStub.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
StorageServer::StorageServer(const Components::Config &config)
|
||||
: net::Server(config.base()),
|
||||
components_(config) {}
|
||||
|
||||
StorageServer::~StorageServer() {
|
||||
stopAndJoin();
|
||||
XLOGF(INFO, "Destructor StorageServer");
|
||||
}
|
||||
|
||||
Result<Void> StorageServer::beforeStart() {
|
||||
RETURN_AND_LOG_ON_ERROR(addSerdeService(std::make_unique<StorageService>(components_.storageOperator), true));
|
||||
RETURN_AND_LOG_ON_ERROR(addSerdeService(std::make_unique<core::CoreService>()));
|
||||
groups().front()->setCoroutinesPoolGetter([this](const serde::MessagePacket<> &packet) -> DynamicCoroutinesPool & {
|
||||
switch (packet.serviceId) {
|
||||
case StorageSerde<>::kServiceID:
|
||||
return components_.getCoroutinesPool(packet.methodId);
|
||||
default:
|
||||
return components_.defaultPool;
|
||||
}
|
||||
});
|
||||
RETURN_AND_LOG_ON_ERROR(components_.start(appInfo(), tpg()));
|
||||
return Void{};
|
||||
}
|
||||
|
||||
Result<Void> StorageServer::beforeStop() {
|
||||
components_.reliableUpdate.beforeStop();
|
||||
components_.reliableForwarding.beforeStop();
|
||||
return Void{};
|
||||
}
|
||||
|
||||
Result<Void> StorageServer::afterStop() {
|
||||
RETURN_AND_LOG_ON_ERROR(components_.stopAndJoin(tpg().procThreadPool()));
|
||||
return Void{};
|
||||
}
|
||||
|
||||
hf3fs::Result<Void> StorageServer::start(const flat::AppInfo &info,
|
||||
std::unique_ptr<::hf3fs::net::Client> client,
|
||||
std::shared_ptr<::hf3fs::client::MgmtdClient> mgmtdClient) {
|
||||
components_.netClient = std::move(client);
|
||||
components_.mgmtdClient = std::make_unique<hf3fs::client::MgmtdClientForServer>(std::move(mgmtdClient));
|
||||
return net::Server::start(info);
|
||||
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
59
src/storage/service/StorageServer.h
Normal file
59
src/storage/service/StorageServer.h
Normal file
@@ -0,0 +1,59 @@
|
||||
#pragma once
|
||||
|
||||
#include <folly/CancellationToken.h>
|
||||
|
||||
#include "client/mgmtd/MgmtdClientForServer.h"
|
||||
#include "common/net/Server.h"
|
||||
#include "core/app/ServerAppConfig.h"
|
||||
#include "core/app/ServerLauncher.h"
|
||||
#include "core/app/ServerLauncherConfig.h"
|
||||
#include "core/app/ServerMgmtdClientFetcher.h"
|
||||
#include "storage/service/Components.h"
|
||||
#include "storage/service/ReliableForwarding.h"
|
||||
#include "storage/service/ReliableUpdate.h"
|
||||
#include "storage/service/StorageOperator.h"
|
||||
|
||||
namespace hf3fs::test {
|
||||
struct StorageServerHelper;
|
||||
}
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
class StorageServer : public net::Server {
|
||||
public:
|
||||
static constexpr auto kName = "Storage";
|
||||
static constexpr auto kNodeType = flat::NodeType::STORAGE;
|
||||
|
||||
using AppConfig = core::ServerAppConfig;
|
||||
struct LauncherConfig : public core::ServerLauncherConfig {
|
||||
LauncherConfig() { mgmtd_client() = hf3fs::client::MgmtdClientForServer::Config{}; }
|
||||
};
|
||||
using RemoteConfigFetcher = core::launcher::ServerMgmtdClientFetcher;
|
||||
using Launcher = core::ServerLauncher<StorageServer>;
|
||||
|
||||
using CommonConfig = ApplicationBase::Config;
|
||||
using Config = Components::Config;
|
||||
StorageServer(const Components::Config &config);
|
||||
~StorageServer() override;
|
||||
|
||||
// set up storage server.
|
||||
Result<Void> beforeStart() final;
|
||||
|
||||
// before server stop.
|
||||
Result<Void> beforeStop() final;
|
||||
|
||||
// tear down storage server.
|
||||
Result<Void> afterStop() final;
|
||||
|
||||
using net::Server::start;
|
||||
hf3fs::Result<Void> start(const flat::AppInfo &info,
|
||||
std::unique_ptr<::hf3fs::net::Client> client,
|
||||
std::shared_ptr<::hf3fs::client::MgmtdClient> mgmtdClient);
|
||||
|
||||
private:
|
||||
friend struct test::StorageServerHelper;
|
||||
ConstructLog<"storage::StorageServer"> constructLog_;
|
||||
Components components_;
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
32
src/storage/service/StorageService.cc
Normal file
32
src/storage/service/StorageService.cc
Normal file
@@ -0,0 +1,32 @@
|
||||
#include "storage/service/StorageService.h"
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
namespace {
|
||||
|
||||
monitor::LatencyRecorder readQueueLatency{"storage.read.queue_latency"};
|
||||
monitor::LatencyRecorder updateQueueLatency{"storage.update.queue_latency"};
|
||||
monitor::LatencyRecorder defaultQueueLatency{"storage.default.queue_latency"};
|
||||
|
||||
} // namespace
|
||||
|
||||
void StorageService::reportReadQueueLatency(serde::CallContext &ctx) {
|
||||
if (ctx.packet().timestamp) {
|
||||
readQueueLatency.addSample(ctx.packet().timestamp->queueLatency());
|
||||
}
|
||||
}
|
||||
|
||||
void StorageService::reportUpdateQueueLatency(serde::CallContext &ctx) {
|
||||
if (ctx.packet().timestamp) {
|
||||
updateQueueLatency.addSample(ctx.packet().timestamp->queueLatency());
|
||||
}
|
||||
}
|
||||
|
||||
void StorageService::reportDefaultQueueLatency(serde::CallContext &ctx) {
|
||||
if (ctx.packet().timestamp) {
|
||||
defaultQueueLatency.addSample(ctx.packet().timestamp->queueLatency());
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
111
src/storage/service/StorageService.h
Normal file
111
src/storage/service/StorageService.h
Normal file
@@ -0,0 +1,111 @@
|
||||
#pragma once
|
||||
|
||||
#include "common/serde/CallContext.h"
|
||||
#include "fbs/storage/Service.h"
|
||||
#include "storage/service/StorageOperator.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
class StorageService : public serde::ServiceWrapper<StorageService, storage::StorageSerde> {
|
||||
public:
|
||||
StorageService(StorageOperator &storageOperator)
|
||||
: storageOperator_(storageOperator) {}
|
||||
|
||||
CoTryTask<BatchReadRsp> batchRead(serde::CallContext &ctx, const BatchReadReq &req) {
|
||||
reportReadQueueLatency(ctx);
|
||||
if (UNLIKELY(req.payloads.empty())) co_return BatchReadRsp{.tag = req.tag};
|
||||
ServiceRequestContext requestCtx{"batchRead", req.tag, req.retryCount, req.userInfo, req.debugFlags};
|
||||
co_return co_await storageOperator_.batchRead(requestCtx, req, ctx);
|
||||
}
|
||||
|
||||
CoTryTask<WriteRsp> write(serde::CallContext &ctx, const WriteReq &req) {
|
||||
reportUpdateQueueLatency(ctx);
|
||||
ServiceRequestContext requestCtx{"write", req.tag, req.retryCount, req.userInfo, req.debugFlags};
|
||||
co_return co_await storageOperator_.write(requestCtx, req, ctx.transport()->ibSocket());
|
||||
}
|
||||
|
||||
CoTryTask<UpdateRsp> update(serde::CallContext &ctx, const UpdateReq &req) {
|
||||
reportUpdateQueueLatency(ctx);
|
||||
ServiceRequestContext requestCtx{"update", req.tag, req.retryCount, req.userInfo, req.debugFlags};
|
||||
co_return co_await storageOperator_.update(requestCtx, req, ctx.transport()->ibSocket());
|
||||
}
|
||||
|
||||
CoTryTask<QueryLastChunkRsp> queryLastChunk(serde::CallContext &ctx, const QueryLastChunkReq &req) {
|
||||
reportDefaultQueueLatency(ctx);
|
||||
if (UNLIKELY(req.payloads.empty())) co_return QueryLastChunkRsp{};
|
||||
ServiceRequestContext requestCtx{"queryLastChunk", req.tag, req.retryCount, req.userInfo, req.debugFlags};
|
||||
co_return co_await storageOperator_.queryLastChunk(requestCtx, req);
|
||||
}
|
||||
|
||||
CoTryTask<TruncateChunksRsp> truncateChunks(serde::CallContext &ctx, const TruncateChunksReq &req) {
|
||||
reportDefaultQueueLatency(ctx);
|
||||
if (UNLIKELY(req.payloads.empty())) co_return TruncateChunksRsp{};
|
||||
ServiceRequestContext requestCtx{"truncateChunks",
|
||||
req.payloads.front().tag,
|
||||
req.payloads.front().retryCount,
|
||||
req.userInfo,
|
||||
req.debugFlags};
|
||||
co_return co_await storageOperator_.truncateChunks(requestCtx, req);
|
||||
}
|
||||
|
||||
CoTryTask<RemoveChunksRsp> removeChunks(serde::CallContext &ctx, const RemoveChunksReq &req) {
|
||||
reportDefaultQueueLatency(ctx);
|
||||
if (UNLIKELY(req.payloads.empty())) co_return RemoveChunksRsp{};
|
||||
ServiceRequestContext requestCtx{"removeChunks",
|
||||
req.payloads.front().tag,
|
||||
req.payloads.front().retryCount,
|
||||
req.userInfo,
|
||||
req.debugFlags};
|
||||
co_return co_await storageOperator_.removeChunks(requestCtx, req);
|
||||
}
|
||||
|
||||
CoTryTask<TargetSyncInfo> syncStart(serde::CallContext &ctx, const SyncStartReq &req) {
|
||||
reportDefaultQueueLatency(ctx);
|
||||
return storageOperator_.syncStart(req);
|
||||
}
|
||||
|
||||
CoTryTask<SyncDoneRsp> syncDone(serde::CallContext &ctx, const SyncDoneReq &req) {
|
||||
reportDefaultQueueLatency(ctx);
|
||||
return storageOperator_.syncDone(req);
|
||||
}
|
||||
|
||||
CoTryTask<SpaceInfoRsp> spaceInfo(serde::CallContext &ctx, const SpaceInfoReq &req) {
|
||||
reportDefaultQueueLatency(ctx);
|
||||
return storageOperator_.spaceInfo(req);
|
||||
}
|
||||
|
||||
CoTryTask<CreateTargetRsp> createTarget(serde::CallContext &ctx, const CreateTargetReq &req) {
|
||||
reportDefaultQueueLatency(ctx);
|
||||
return storageOperator_.createTarget(req);
|
||||
}
|
||||
|
||||
CoTryTask<OfflineTargetRsp> offlineTarget(serde::CallContext &ctx, const OfflineTargetReq &req) {
|
||||
reportDefaultQueueLatency(ctx);
|
||||
return storageOperator_.offlineTarget(req);
|
||||
}
|
||||
|
||||
CoTryTask<RemoveTargetRsp> removeTarget(serde::CallContext &ctx, const RemoveTargetReq &req) {
|
||||
reportDefaultQueueLatency(ctx);
|
||||
return storageOperator_.removeTarget(req);
|
||||
}
|
||||
|
||||
CoTryTask<QueryChunkRsp> queryChunk(serde::CallContext &ctx, const QueryChunkReq &req) {
|
||||
reportDefaultQueueLatency(ctx);
|
||||
return storageOperator_.queryChunk(req);
|
||||
}
|
||||
|
||||
CoTryTask<GetAllChunkMetadataRsp> getAllChunkMetadata(serde::CallContext &ctx, const GetAllChunkMetadataReq &req) {
|
||||
reportDefaultQueueLatency(ctx);
|
||||
return storageOperator_.getAllChunkMetadata(req);
|
||||
}
|
||||
|
||||
private:
|
||||
void reportReadQueueLatency(serde::CallContext &ctx);
|
||||
void reportUpdateQueueLatency(serde::CallContext &ctx);
|
||||
void reportDefaultQueueLatency(serde::CallContext &ctx);
|
||||
|
||||
private:
|
||||
StorageOperator &storageOperator_;
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
418
src/storage/service/TargetMap.cc
Normal file
418
src/storage/service/TargetMap.cc
Normal file
@@ -0,0 +1,418 @@
|
||||
#include "storage/service/TargetMap.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/utils/RobinHood.h"
|
||||
#include "fbs/mgmtd/MgmtdTypes.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
namespace {
|
||||
|
||||
monitor::OperationRecorder updateRoutingRecorder{"storage.update_routing"};
|
||||
|
||||
} // namespace
|
||||
|
||||
Result<net::Address> Target::getSuccessorAddr() const {
|
||||
if (UNLIKELY(!successor.has_value())) {
|
||||
return makeError(StorageCode::kNoSuccessorTarget);
|
||||
}
|
||||
auto &serviceGroups = successor->nodeInfo.app.serviceGroups;
|
||||
if (UNLIKELY(serviceGroups.empty())) {
|
||||
auto msg = fmt::format("target {} successor service groups is empty", *this);
|
||||
XLOG(ERR, msg);
|
||||
return makeError(StorageCode::kNoSuccessorAddr, std::move(msg));
|
||||
}
|
||||
auto &endpoints = serviceGroups.front().endpoints;
|
||||
if (UNLIKELY(endpoints.empty())) {
|
||||
auto msg = fmt::format("target {} successor service endpoints is empty", *this);
|
||||
XLOG(ERR, msg);
|
||||
return makeError(StorageCode::kNoSuccessorAddr, std::move(msg));
|
||||
}
|
||||
return endpoints.front();
|
||||
}
|
||||
|
||||
Result<TargetId> TargetMap::getTargetId(ChainId chainId) const {
|
||||
auto chainToTargetIt = chainToTarget_.find(chainId);
|
||||
if (UNLIKELY(chainToTargetIt == chainToTarget_.end())) {
|
||||
auto msg = fmt::format("chain {} not found", chainId);
|
||||
XLOG(ERR, msg);
|
||||
return makeError(StorageClientCode::kRoutingError, std::move(msg));
|
||||
}
|
||||
return chainToTargetIt->second;
|
||||
}
|
||||
|
||||
Result<const Target *> TargetMap::getTarget(TargetId targetId) const {
|
||||
auto targetsIt = targets_.find(targetId);
|
||||
if (UNLIKELY(targetsIt == targets_.end())) {
|
||||
auto msg = fmt::format("target {} not found", targetId);
|
||||
XLOG(ERR, msg);
|
||||
return makeError(StorageClientCode::kRoutingError, std::move(msg));
|
||||
}
|
||||
return &targetsIt->second;
|
||||
}
|
||||
|
||||
Result<const Target *> TargetMap::getByChainId(VersionedChainId vChainId, bool allowOutdatedChainVer) const {
|
||||
CHECK_RESULT(targetId, getTargetId(vChainId.chainId));
|
||||
CHECK_RESULT(target, getTarget(targetId));
|
||||
if (target->vChainId != vChainId && (!allowOutdatedChainVer || vChainId.chainVer > target->vChainId.chainVer)) {
|
||||
auto msg = fmt::format("chain {} version mismatch request {} != local {}",
|
||||
vChainId.chainId,
|
||||
vChainId.chainVer,
|
||||
target->vChainId.chainVer);
|
||||
XLOG(ERR, msg);
|
||||
return makeError(StorageClientCode::kRoutingVersionMismatch, std::move(msg));
|
||||
}
|
||||
if (target->localState == flat::LocalTargetState::OFFLINE) {
|
||||
auto msg = fmt::format("chain {} target {} is offline", vChainId.chainId, target->targetId);
|
||||
XLOG(ERR, msg);
|
||||
return makeError(StorageCode::kTargetOffline, std::move(msg));
|
||||
}
|
||||
if (target->storageTarget == nullptr) {
|
||||
auto msg = fmt::format("chain {} target {} is offline", vChainId.chainId, target->targetId);
|
||||
XLOG(CRITICAL, msg);
|
||||
return makeError(StorageCode::kTargetOffline, std::move(msg));
|
||||
}
|
||||
return target;
|
||||
}
|
||||
|
||||
Result<Void> TargetMap::addStorageTarget(const std::shared_ptr<StorageTarget> &storageTarget) {
|
||||
auto targetId = storageTarget->targetId();
|
||||
Target target;
|
||||
target.storageTarget = storageTarget;
|
||||
target.targetId = targetId;
|
||||
target.chainId = storageTarget->chainId();
|
||||
target.path = storageTarget->path();
|
||||
target.localState = flat::LocalTargetState::ONLINE;
|
||||
target.diskIndex = storageTarget->diskIndex();
|
||||
target.useChunkEngine = storageTarget->useChunkEngine();
|
||||
auto [it, succ] = targets_.emplace(targetId, target);
|
||||
if (UNLIKELY(!succ)) {
|
||||
if (it->second.localState == flat::LocalTargetState::OFFLINE) {
|
||||
it->second = std::move(target);
|
||||
return Void{};
|
||||
}
|
||||
auto msg = fmt::format("target {} already exists", targetId);
|
||||
XLOG(ERR, msg);
|
||||
return makeError(StorageCode::kTargetStateInvalid, std::move(msg));
|
||||
}
|
||||
return Void{};
|
||||
}
|
||||
|
||||
Result<Target *> TargetMap::getMutableTarget(TargetId targetId) {
|
||||
auto targetsIt = targets_.find(targetId);
|
||||
if (UNLIKELY(targetsIt == targets_.end())) {
|
||||
auto msg = fmt::format("target {} not found", targetId);
|
||||
return makeError(StorageClientCode::kRoutingError, std::move(msg));
|
||||
}
|
||||
return &targetsIt->second;
|
||||
}
|
||||
|
||||
Result<Void> TargetMap::syncReceiveDone(VersionedChainId chainId) {
|
||||
CHECK_RESULT(constTarget, getByChainId(chainId, false));
|
||||
auto targetId = constTarget->targetId;
|
||||
CHECK_RESULT(target, getMutableTarget(targetId));
|
||||
XLOGF(WARNING,
|
||||
"chain {} target {} sync receive done {} -> UPTODATE",
|
||||
chainId,
|
||||
targetId,
|
||||
magic_enum::enum_name(target->localState));
|
||||
target->localState = flat::LocalTargetState::UPTODATE;
|
||||
return Void{};
|
||||
}
|
||||
|
||||
Result<Void> TargetMap::updateRouting(std::shared_ptr<hf3fs::client::RoutingInfo> r, bool log /* = true */) {
|
||||
auto recordGuard = updateRoutingRecorder.record();
|
||||
|
||||
if (UNLIKELY(r == nullptr)) {
|
||||
XLOGF(ERR, "routing info is empty");
|
||||
return makeError(StorageClientCode::kRoutingError, "routing info is empty");
|
||||
}
|
||||
auto &routingInfo = r->raw();
|
||||
if (routingInfoVersion_ > routingInfo->routingInfoVersion) {
|
||||
auto msg = fmt::format("routing info expired! {} > {}", routingInfoVersion_, routingInfo->routingInfoVersion);
|
||||
XLOG(ERR, msg);
|
||||
return makeError(StorageClientCode::kRoutingError, std::move(msg));
|
||||
}
|
||||
XLOGF(INFO, "routing info updated, {} -> {}", routingInfoVersion_, routingInfo->routingInfoVersion);
|
||||
|
||||
// 1. reset current state.
|
||||
routingInfoVersion_ = routingInfo->routingInfoVersion;
|
||||
chainToTarget_.clear();
|
||||
syncingChains_.clear();
|
||||
robin_hood::unordered_set<TargetId> headTargets;
|
||||
robin_hood::unordered_set<TargetId> tailTargets;
|
||||
robin_hood::unordered_set<TargetId> lastSrvTargets;
|
||||
for (auto &[targetId, target] : targets_) {
|
||||
if (target.isHead) {
|
||||
headTargets.insert(target.targetId);
|
||||
}
|
||||
if (target.isTail) {
|
||||
tailTargets.insert(target.targetId);
|
||||
}
|
||||
if (target.publicState == flat::PublicTargetState::LASTSRV) {
|
||||
lastSrvTargets.insert(target.targetId);
|
||||
}
|
||||
target.isHead = false;
|
||||
target.isTail = false;
|
||||
target.vChainId = VersionedChainId{};
|
||||
target.publicState = flat::PublicTargetState::INVALID;
|
||||
target.successor = std::nullopt;
|
||||
}
|
||||
bool invalidRoutingInfo = false;
|
||||
auto invalidRoutingInfoLogGuard = folly::makeGuard([&] {
|
||||
if (invalidRoutingInfo) {
|
||||
XLOGF(CRITICAL, "invalid routing info: {}", *routingInfo);
|
||||
}
|
||||
});
|
||||
|
||||
// 2. iterate routing info.
|
||||
for (auto &[id, chain] : routingInfo->chains) {
|
||||
// 3. find target in chain.
|
||||
auto it = std::find_if(chain.targets.begin(), chain.targets.end(), [&](const flat::ChainTargetInfo &targetInfo) {
|
||||
return bool(getMutableTarget(targetInfo.targetId));
|
||||
});
|
||||
if (it == chain.targets.end()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// 4. find target info.
|
||||
auto targetId = it->targetId;
|
||||
auto targetInfo = routingInfo->getTarget(targetId);
|
||||
if (UNLIKELY(!targetInfo)) {
|
||||
auto msg = fmt::format("targetInfo id {} not found", targetId);
|
||||
XLOG(ERR, msg);
|
||||
return makeError(StorageClientCode::kRoutingError, std::move(msg));
|
||||
}
|
||||
|
||||
// 5. update local target.
|
||||
CHECK_RESULT(target, getMutableTarget(targetId));
|
||||
bool targetIsServing = targetInfo->publicState == flat::PublicTargetState::SERVING ||
|
||||
targetInfo->publicState == flat::PublicTargetState::SYNCING;
|
||||
auto previousLocalState = target->localState;
|
||||
target->isHead = (targetIsServing && it == chain.targets.begin());
|
||||
target->vChainId = VersionedChainId{chain.chainId, chain.chainVersion};
|
||||
if (target->storageTarget != nullptr) {
|
||||
if (target->storageTarget->chainId() == ChainId{}) {
|
||||
RETURN_AND_LOG_ON_ERROR(target->storageTarget->setChainId(chain.chainId));
|
||||
}
|
||||
if (target->storageTarget->chainId() != chain.chainId) {
|
||||
auto msg = fmt::format("target.chain != routing.chain, target {}, chain {}", *target, chain);
|
||||
XLOG(ERR, msg);
|
||||
return makeError(StorageClientCode::kRoutingError, std::move(msg));
|
||||
}
|
||||
}
|
||||
target->localState = updateLocalState(targetId, previousLocalState, targetInfo->publicState);
|
||||
target->publicState = targetInfo->publicState;
|
||||
auto [chainToTargetIt, succ] = chainToTarget_.emplace(chain.chainId, targetId);
|
||||
if (!succ) {
|
||||
auto msg = fmt::format("chain {} map to 2 targets {}, {}", chain.chainId, chainToTargetIt->second, targetId);
|
||||
XLOG(ERR, msg);
|
||||
return makeError(StorageClientCode::kRoutingError, std::move(msg));
|
||||
}
|
||||
|
||||
if (previousLocalState != flat::LocalTargetState::OFFLINE &&
|
||||
target->localState == flat::LocalTargetState::OFFLINE) {
|
||||
target->weakStorageTarget = target->storageTarget->aliveWeakPtr();
|
||||
target->storageTarget = nullptr;
|
||||
continue;
|
||||
}
|
||||
|
||||
// 6. update successor.
|
||||
while (targetIsServing && ++it != chain.targets.end()) {
|
||||
auto targetInfo = routingInfo->getTarget(it->targetId);
|
||||
if (UNLIKELY(!targetInfo)) {
|
||||
auto msg = fmt::format("successor {} not found", it->targetId);
|
||||
XLOG(ERR, msg);
|
||||
return makeError(StorageClientCode::kRoutingError, std::move(msg));
|
||||
}
|
||||
if (targetInfo->publicState == flat::PublicTargetState::SERVING) {
|
||||
target->successor = Successor{{}, *targetInfo};
|
||||
} else if (targetInfo->publicState == flat::PublicTargetState::SYNCING) {
|
||||
target->successor = Successor{{}, *targetInfo};
|
||||
syncingChains_.push_back(VersionedChainId{chain.chainId, chain.chainVersion});
|
||||
}
|
||||
|
||||
if (target->successor) {
|
||||
if (!targetInfo->nodeId.has_value()) {
|
||||
XLOGF(WARNING, "target {} node id is nullopt", it->targetId);
|
||||
break;
|
||||
}
|
||||
auto node = routingInfo->getNode(*targetInfo->nodeId);
|
||||
if (!node) {
|
||||
XLOGF(WARNING, "node {} not found", targetInfo->nodeId);
|
||||
break;
|
||||
}
|
||||
target->successor->nodeInfo = *node;
|
||||
if (UNLIKELY(target->successor->nodeInfo.app.serviceGroups.empty())) {
|
||||
XLOGF(CRITICAL, "successor invalid! chain {}, successor {}, node {}", chain.chainId, *targetInfo, *node);
|
||||
invalidRoutingInfo = true;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
target->isTail = (targetIsServing && !target->successor.has_value());
|
||||
|
||||
if (headTargets.contains(targetId) ^ target->isHead) {
|
||||
if (target->isHead) {
|
||||
XLOGF_IF(WARNING, log, "target {} becomes head", targetId);
|
||||
} else {
|
||||
XLOGF_IF(WARNING, log, "target {} is no longer head", targetId);
|
||||
}
|
||||
}
|
||||
if (tailTargets.contains(targetId) ^ target->isTail) {
|
||||
if (target->isTail) {
|
||||
XLOGF_IF(WARNING, log, "target {} becomes tail", targetId);
|
||||
} else {
|
||||
XLOGF_IF(WARNING, log, "target {} is no longer tail", targetId);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (auto &[targetId, target] : targets_) {
|
||||
if (lastSrvTargets.contains(targetId) && target.storageTarget &&
|
||||
(target.publicState == flat::PublicTargetState::SERVING ||
|
||||
target.publicState == flat::PublicTargetState::SYNCING ||
|
||||
target.publicState == flat::PublicTargetState::WAITING)) {
|
||||
target.storageTarget->resetUncommitted(target.vChainId.chainVer);
|
||||
}
|
||||
}
|
||||
|
||||
recordGuard.succ();
|
||||
return Void{};
|
||||
}
|
||||
|
||||
Result<Void> TargetMap::removeTarget(TargetId targetId) {
|
||||
auto succ = targets_.erase(targetId);
|
||||
if (succ != 1) {
|
||||
auto msg = fmt::format("target {} not found", targetId);
|
||||
return makeError(StorageClientCode::kRoutingError, std::move(msg));
|
||||
}
|
||||
return Void{};
|
||||
}
|
||||
|
||||
Result<Void> TargetMap::offlineTarget(TargetId targetId) {
|
||||
CHECK_RESULT(target, getMutableTarget(targetId));
|
||||
if (target->unrecoverableOffline()) {
|
||||
return makeError(StorageCode::kTargetOffline, fmt::format("target is already offline, {}.", *target));
|
||||
}
|
||||
|
||||
target->offlineUponUserRequest = true;
|
||||
target->localState = flat::LocalTargetState::OFFLINE;
|
||||
return Void{};
|
||||
}
|
||||
|
||||
Result<Void> TargetMap::offlineTargets(const Path &path) {
|
||||
for (auto &[targetId, target] : targets_) {
|
||||
if (path == target.path.parent_path() && !target.unrecoverableOffline()) {
|
||||
target.diskError = true;
|
||||
target.localState = flat::LocalTargetState::OFFLINE;
|
||||
XLOGF(WARNING, "offline target {} because of disk error", target.path);
|
||||
}
|
||||
}
|
||||
return Void{};
|
||||
}
|
||||
|
||||
Result<Void> TargetMap::updateDiskState(const Path &path, bool lowSpace, bool rejectCreateChunk) {
|
||||
for (auto &[targetId, target] : targets_) {
|
||||
if (path == target.path.parent_path() && !target.unrecoverableOffline()) {
|
||||
target.lowSpace = lowSpace;
|
||||
auto old = std::exchange(target.rejectCreateChunk, rejectCreateChunk);
|
||||
if (old != rejectCreateChunk) {
|
||||
XLOGF(WARNING, "target {} reject create chunk {} -> {}", target.path, old, rejectCreateChunk);
|
||||
}
|
||||
}
|
||||
}
|
||||
return Void{};
|
||||
}
|
||||
|
||||
hf3fs::flat::LocalTargetState TargetMap::updateLocalState(TargetId targetId,
|
||||
hf3fs::flat::LocalTargetState localState,
|
||||
hf3fs::flat::PublicTargetState publicState) {
|
||||
if (localState == hf3fs::flat::LocalTargetState::UPTODATE &&
|
||||
(publicState == hf3fs::flat::PublicTargetState::OFFLINE ||
|
||||
publicState == hf3fs::flat::PublicTargetState::LASTSRV ||
|
||||
publicState == hf3fs::flat::PublicTargetState::WAITING)) {
|
||||
XLOGF(CRITICAL,
|
||||
"move to offline state (shutdown), local target: {}, local state: {} -> OFFLINE, public state: {}",
|
||||
targetId,
|
||||
magic_enum::enum_name(localState),
|
||||
magic_enum::enum_name(publicState));
|
||||
return hf3fs::flat::LocalTargetState::OFFLINE;
|
||||
} else if (localState == hf3fs::flat::LocalTargetState::ONLINE &&
|
||||
publicState == hf3fs::flat::PublicTargetState::SERVING) {
|
||||
XLOGF(INFO,
|
||||
"move to up-to-date state, local target: {}, local state: {} -> UPTODATE, public state: {}",
|
||||
targetId,
|
||||
magic_enum::enum_name(localState),
|
||||
magic_enum::enum_name(publicState));
|
||||
return hf3fs::flat::LocalTargetState::UPTODATE;
|
||||
}
|
||||
return localState;
|
||||
}
|
||||
|
||||
Result<std::shared_ptr<const Target>> AtomicallyTargetMap::getByChainId(
|
||||
VersionedChainId vChainId,
|
||||
bool allowOutdatedChainVer /* = false */) const {
|
||||
auto map = snapshot();
|
||||
auto result = map->getByChainId(vChainId, allowOutdatedChainVer);
|
||||
RETURN_ON_ERROR(result);
|
||||
return std::shared_ptr<const Target>(std::move(map), *result);
|
||||
}
|
||||
|
||||
Result<std::shared_ptr<const Target>> AtomicallyTargetMap::getByTargetId(TargetId targetId) const {
|
||||
auto map = snapshot();
|
||||
auto result = map->getTarget(targetId);
|
||||
RETURN_ON_ERROR(result);
|
||||
return std::shared_ptr<const Target>(std::move(map), *result);
|
||||
}
|
||||
|
||||
Result<Void> AtomicallyTargetMap::updateTargetMap(auto &&updateFunc) {
|
||||
auto lock = std::unique_lock(mutex_);
|
||||
auto map = snapshot();
|
||||
while (true) {
|
||||
auto newMap = map->clone();
|
||||
RETURN_AND_LOG_ON_ERROR(updateFunc(newMap));
|
||||
if (targetMap_.compare_exchange_strong(map, std::move(newMap))) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
updateCallback_(*snapshot());
|
||||
return Void{};
|
||||
};
|
||||
|
||||
Result<Void> AtomicallyTargetMap::addStorageTarget(std::shared_ptr<StorageTarget> storageTarget) {
|
||||
return updateTargetMap([&](std::shared_ptr<TargetMap> &newMap) { return newMap->addStorageTarget(storageTarget); });
|
||||
}
|
||||
|
||||
Result<Void> AtomicallyTargetMap::syncReceiveDone(VersionedChainId vChainId) {
|
||||
return updateTargetMap([&](std::shared_ptr<TargetMap> &newMap) { return newMap->syncReceiveDone(vChainId); });
|
||||
}
|
||||
|
||||
Result<Void> AtomicallyTargetMap::updateRouting(std::shared_ptr<hf3fs::client::RoutingInfo> r) {
|
||||
return updateTargetMap([&](std::shared_ptr<TargetMap> &newMap) { return newMap->updateRouting(r); });
|
||||
}
|
||||
|
||||
Result<Void> AtomicallyTargetMap::removeTarget(TargetId targetId) {
|
||||
return updateTargetMap([&](std::shared_ptr<TargetMap> &newMap) { return newMap->removeTarget(targetId); });
|
||||
}
|
||||
|
||||
Result<Void> AtomicallyTargetMap::offlineTarget(TargetId targetId) {
|
||||
return updateTargetMap([&](std::shared_ptr<TargetMap> &newMap) { return newMap->offlineTarget(targetId); });
|
||||
}
|
||||
|
||||
Result<Void> AtomicallyTargetMap::offlineTargets(const Path &path) {
|
||||
return updateTargetMap([&](std::shared_ptr<TargetMap> &newMap) { return newMap->offlineTargets(path); });
|
||||
}
|
||||
|
||||
Result<Void> AtomicallyTargetMap::updateDiskState(const Path &path, bool lowSpace, bool rejectCreateChunk) {
|
||||
return updateTargetMap(
|
||||
[&](std::shared_ptr<TargetMap> &newMap) { return newMap->updateDiskState(path, lowSpace, rejectCreateChunk); });
|
||||
}
|
||||
|
||||
void AtomicallyTargetMap::updateTargetUsedSize() {
|
||||
auto lock = std::unique_lock(mutex_);
|
||||
updateCallback_(*snapshot());
|
||||
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
137
src/storage/service/TargetMap.h
Normal file
137
src/storage/service/TargetMap.h
Normal file
@@ -0,0 +1,137 @@
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <common/utils/RobinHood.h>
|
||||
#include <folly/concurrency/AtomicSharedPtr.h>
|
||||
#include <memory>
|
||||
|
||||
#include "client/mgmtd/RoutingInfo.h"
|
||||
#include "common/serde/Serde.h"
|
||||
#include "common/utils/ConstructLog.h"
|
||||
#include "fbs/mgmtd/MgmtdTypes.h"
|
||||
#include "fbs/mgmtd/NodeInfo.h"
|
||||
#include "fbs/mgmtd/TargetInfo.h"
|
||||
#include "fbs/storage/Common.h"
|
||||
#include "storage/store/StorageTarget.h"
|
||||
|
||||
namespace hf3fs::test {
|
||||
struct TargetMapHelper;
|
||||
}
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
class TargetMap {
|
||||
public:
|
||||
// [observers] clone current map.
|
||||
std::shared_ptr<TargetMap> clone() const { return std::make_shared<TargetMap>(*this); }
|
||||
|
||||
// [observers] get target id by chain id.
|
||||
Result<TargetId> getTargetId(ChainId chainId) const;
|
||||
|
||||
// [observers] get target by target id.
|
||||
Result<const Target *> getTarget(TargetId targetId) const;
|
||||
|
||||
// [observers] get target by versioned chain id.
|
||||
Result<const Target *> getByChainId(VersionedChainId vChainId, bool allowOutdatedChainVer) const;
|
||||
|
||||
// [observers]
|
||||
auto &getTargets() const { return targets_; }
|
||||
|
||||
// [observers]
|
||||
auto &syncingChains() const { return syncingChains_; }
|
||||
|
||||
// [modifiers] add a new target.
|
||||
Result<Void> addStorageTarget(const std::shared_ptr<StorageTarget> &storageTarget);
|
||||
|
||||
// [modifiers] get target by target id.
|
||||
Result<Target *> getMutableTarget(TargetId targetId);
|
||||
|
||||
// [modifiers] sync receive is started.
|
||||
Result<Void> syncReceiveDone(VersionedChainId vChainId);
|
||||
|
||||
// [modifiers] update by routing info.
|
||||
Result<Void> updateRouting(std::shared_ptr<hf3fs::client::RoutingInfo> r, bool log = true);
|
||||
|
||||
// [modifiers] set target as offline.
|
||||
Result<Void> removeTarget(TargetId targetId);
|
||||
|
||||
// [modifiers] set target as offline.
|
||||
Result<Void> offlineTarget(TargetId targetId);
|
||||
|
||||
// [modifiers] set targets in path as offline.
|
||||
Result<Void> offlineTargets(const Path &path);
|
||||
|
||||
// [modifiers] reject create chunk for targets in path.
|
||||
Result<Void> updateDiskState(const Path &path, bool lowSpace, bool rejectCreateChunk);
|
||||
|
||||
// update local state.
|
||||
static hf3fs::flat::LocalTargetState updateLocalState(TargetId targetId,
|
||||
hf3fs::flat::LocalTargetState localState,
|
||||
hf3fs::flat::PublicTargetState publicState);
|
||||
|
||||
private:
|
||||
friend struct test::TargetMapHelper;
|
||||
robin_hood::unordered_map<TargetId, Target> targets_;
|
||||
flat::RoutingInfoVersion routingInfoVersion_;
|
||||
robin_hood::unordered_map<ChainId, TargetId> chainToTarget_;
|
||||
std::vector<VersionedChainId> syncingChains_;
|
||||
};
|
||||
|
||||
class AtomicallyTargetMap {
|
||||
public:
|
||||
// [observers] get a snapshot of target map.
|
||||
auto snapshot() const { return targetMap_.load(); }
|
||||
|
||||
// [observers] get target by chain id.
|
||||
Result<std::shared_ptr<const Target>> getByChainId(VersionedChainId vChainId,
|
||||
bool allowOutdatedChainVer = false) const;
|
||||
|
||||
// [observers] get target by its id.
|
||||
Result<std::shared_ptr<const Target>> getByTargetId(TargetId targetId) const;
|
||||
|
||||
// [modifiers] set update callback.
|
||||
void setUpdateCallback(auto &&func) {
|
||||
auto lock = std::unique_lock(mutex_);
|
||||
updateCallback_ = std::forward<decltype(func)>(func);
|
||||
}
|
||||
|
||||
// [modifiers] add a target.
|
||||
Result<Void> addStorageTarget(std::shared_ptr<StorageTarget> storageTarget);
|
||||
|
||||
// [modifiers] sync receive is done.
|
||||
Result<Void> syncReceiveDone(VersionedChainId vChainId);
|
||||
|
||||
// [modifiers] update by routing info.
|
||||
Result<Void> updateRouting(std::shared_ptr<hf3fs::client::RoutingInfo> r);
|
||||
|
||||
// [modifiers] set target as offline.
|
||||
Result<Void> removeTarget(TargetId targetId);
|
||||
|
||||
// [modifiers] set target as offline.
|
||||
Result<Void> offlineTarget(TargetId targetId);
|
||||
|
||||
// [modifiers] set targets in path as offline.
|
||||
Result<Void> offlineTargets(const Path &path);
|
||||
|
||||
// [modifiers] reject create chunk for targets in path.
|
||||
Result<Void> updateDiskState(const Path &path, bool lowSpace, bool rejectCreateChunk);
|
||||
|
||||
// [modifiers] update target used size.
|
||||
void updateTargetUsedSize();
|
||||
|
||||
// [modifiers] release target map.
|
||||
auto release() { return targetMap_.exchange(nullptr); }
|
||||
|
||||
protected:
|
||||
// [modifiers] update target map atomically.
|
||||
Result<Void> updateTargetMap(auto &&updateFunc);
|
||||
|
||||
private:
|
||||
friend struct test::TargetMapHelper;
|
||||
ConstructLog<"storage::AtomicallyTargetMap"> constructLog_;
|
||||
std::mutex mutex_; // for update operation.
|
||||
std::function<void(const TargetMap &)> updateCallback_ = [](auto) {};
|
||||
folly::atomic_shared_ptr<const TargetMap> targetMap_{std::make_shared<const TargetMap>()};
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
Reference in New Issue
Block a user