mirror of
https://github.com/deepseek-ai/3FS
synced 2025-06-26 18:16:45 +00:00
Initial commit
This commit is contained in:
8
src/fuse/CMakeLists.txt
Normal file
8
src/fuse/CMakeLists.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
# Extra library search directory used when linking the targets below.
link_directories(/usr/local/lib/x86_64-linux-gnu/)

# target_add_lib / target_add_bin are helpers defined outside this file;
# presumably the names after the target are its link dependencies -- TODO confirm.
target_add_lib(
  hf3fs_fuse
  common
  core-app
  meta-client
  storage-client
  fuse3
  client-lib-common
)
target_add_bin(
  hf3fs_fuse_main
  hf3fs_fuse.cpp
  hf3fs_fuse
)

# Expose ENABLE_FUSE_APPLICATION to hf3fs_fuse and everything that links it.
if(ENABLE_FUSE_APPLICATION)
  target_compile_definitions(hf3fs_fuse PUBLIC -DENABLE_FUSE_APPLICATION)
endif()
|
||||
10
src/fuse/FuseAppConfig.cc
Normal file
10
src/fuse/FuseAppConfig.cc
Normal file
@@ -0,0 +1,10 @@
|
||||
#include "FuseAppConfig.h"
|
||||
|
||||
#include "common/app/ApplicationBase.h"
|
||||
|
||||
namespace hf3fs::fuse {
|
||||
void FuseAppConfig::init(const String &filePath, bool dump, const std::vector<config::KeyValue> &updates) {
|
||||
auto res = ApplicationBase::initConfig(*this, filePath, dump, updates);
|
||||
XLOGF_IF(FATAL, !res, "Init app config failed: {}. filePath: {}. dump: {}", res.error(), filePath, dump);
|
||||
}
|
||||
} // namespace hf3fs::fuse
|
||||
16
src/fuse/FuseAppConfig.h
Normal file
16
src/fuse/FuseAppConfig.h
Normal file
@@ -0,0 +1,16 @@
|
||||
#pragma once
|
||||
|
||||
#include "common/app/NodeId.h"
|
||||
#include "common/net/ib/IBDevice.h"
|
||||
#include "common/utils/ConfigBase.h"
|
||||
|
||||
namespace hf3fs::fuse {
|
||||
/// Application-level config object of the FUSE process.
struct FuseAppConfig : public ConfigBase<FuseAppConfig> {
  using Base = ConfigBase<FuseAppConfig>;
  // Keep the base-class init overloads visible next to the overload declared below.
  using Base::init;

  /// Initializes the config from `filePath` with `updates` applied (see FuseAppConfig.cc).
  void init(const String &filePath, bool dump, const std::vector<config::KeyValue> &updates);

  /// Always reports node id 0.
  flat::NodeId getNodeId() const {
    return flat::NodeId(0);
  }
};
|
||||
} // namespace hf3fs::fuse
|
||||
125
src/fuse/FuseApplication.cc
Normal file
125
src/fuse/FuseApplication.cc
Normal file
@@ -0,0 +1,125 @@
|
||||
#ifdef ENABLE_FUSE_APPLICATION
|
||||
|
||||
#include "FuseApplication.h"
|
||||
|
||||
#include "FuseMainLoop.h"
|
||||
#include "FuseOps.h"
|
||||
#include "common/app/Thread.h"
|
||||
#include "common/app/Utils.h"
|
||||
|
||||
DECLARE_string(cfg);
|
||||
DECLARE_bool(dump_default_cfg);
|
||||
DECLARE_bool(use_local_cfg);
|
||||
|
||||
namespace hf3fs::fuse {
|
||||
|
||||
/// Private state and logic of FuseApplication; the public methods forward here.
struct FuseApplication::Impl {
  Result<Void> parseFlags(int *argc, char ***argv);
  Result<Void> initApplication();
  Result<Void> initFuseClients();
  void stop();
  int mainLoop();

  // Full FUSE config (exposed via FuseApplication::getConfig) and app info (FuseApplication::info).
  Config hf3fsConfig;
  flat::AppInfo appInfo;
  // Only needed during start-up; released at the end of initApplication().
  std::unique_ptr<Launcher> launcher_{std::make_unique<Launcher>()};

  // Guards returned by the log / memory config-update callback factories.
  std::unique_ptr<ConfigCallbackGuard> onLogConfigUpdated_;
  std::unique_ptr<ConfigCallbackGuard> onMemConfigUpdated_;

  // Filled by parseFlags().
  ConfigFlags configFlags_;
  String programName;

  // Values captured in initFuseClients() and later handed to fuseMainLoop().
  bool allowOther{false};
  String configMountpoint;
  size_t configMaxBufSize{0};
  String configClusterId;
};
|
||||
|
||||
FuseApplication::FuseApplication()
|
||||
: impl_(std::make_unique<Impl>()) {}
|
||||
|
||||
// Defined in this translation unit, where Impl is a complete type, so that
// std::unique_ptr<Impl> can destroy it.
FuseApplication::~FuseApplication() = default;
|
||||
|
||||
/// Parses command-line flags in two passes: first the launcher's own flags,
/// then the dynamic "--config." overrides (stored in configFlags_).
/// Finally records argv[0] as the program name.
Result<Void> FuseApplication::Impl::parseFlags(int *argc, char ***argv) {
  static constexpr std::string_view kDynamicConfigPrefix = "--config.";

  RETURN_ON_ERROR(launcher_->parseFlags(argc, argv));
  RETURN_ON_ERROR(ApplicationBase::parseFlags(kDynamicConfigPrefix, argc, argv, configFlags_));

  char **args = *argv;
  programName = args[0];
  return Void{};
}
|
||||
|
||||
// Forwards to Impl::parseFlags.
Result<Void> FuseApplication::parseFlags(int *argc, char ***argv) {
  return impl_->parseFlags(argc, argv);
}
|
||||
|
||||
/// Brings the application up:
///   1. with --dump_default_cfg, prints the default config and exits;
///   2. initializes the launcher, loads app info and the config;
///   3. initializes the common components and registers log/memory config callbacks;
///   4. persists the config and initializes the FUSE clients;
///   5. releases the launcher.
/// Launcher or client initialization failures are logged at FATAL level.
Result<Void> FuseApplication::Impl::initApplication() {
  if (FLAGS_dump_default_cfg) {
    fmt::print("{}\n", hf3fsConfig.toString());
    exit(0);
  }

  auto launcherInit = launcher_->init();
  XLOGF_IF(FATAL, !launcherInit, "Failed to init launcher: {}", launcherInit.error());

  app_detail::loadAppInfo([this] { return launcher_->loadAppInfo(); }, appInfo);
  app_detail::initConfig(hf3fsConfig, configFlags_, appInfo, [this] { return launcher_->loadConfigTemplate(); });
  XLOGF(INFO, "Server config inited");

  app_detail::initCommonComponents(hf3fsConfig.common(), kName, appInfo.nodeId);

  onLogConfigUpdated_ = app_detail::makeLogConfigUpdateCallback(hf3fsConfig.common().log(), kName);
  onMemConfigUpdated_ = app_detail::makeMemConfigUpdateCallback(hf3fsConfig.common().memory(), appInfo.hostname);

  XLOGF(INFO, "Full Config:\n{}", hf3fsConfig.toString());
  app_detail::persistConfig(hf3fsConfig);

  XLOGF(INFO, "Start to init fuse clients");
  auto clientsInit = initFuseClients();
  XLOGF_IF(FATAL, !clientsInit, "Init fuse clients failed: {}", clientsInit.error());
  XLOGF(INFO, "Init fuse clients finished");

  // The launcher is not used after start-up.
  launcher_.reset();

  return Void{};
}
|
||||
|
||||
/// Copies the settings later needed by mainLoop() out of the launcher/FUSE configs
/// (the launcher is released once start-up is done), then initializes the
/// FuseClients instance returned by getFuseClientsInstance().
Result<Void> FuseApplication::Impl::initFuseClients() {
  const auto &launcherCfg = launcher_->launcherConfig();

  configClusterId = launcherCfg.cluster_id();
  configMountpoint = launcherCfg.mountpoint();
  allowOther = launcherCfg.allow_other();
  configMaxBufSize = hf3fsConfig.io_bufs().max_buf_size();

  auto &clients = getFuseClientsInstance();
  RETURN_ON_ERROR(clients.init(appInfo, launcherCfg.mountpoint(), launcherCfg.token_file(), hf3fsConfig));
  return Void{};
}
|
||||
|
||||
// Forwards to Impl::initApplication.
Result<Void> FuseApplication::initApplication() {
  return impl_->initApplication();
}
|
||||
|
||||
/// Stops the FuseClients instance first, then calls hf3fs::stopAndJoin(nullptr).
void FuseApplication::Impl::stop() {
  auto &clients = getFuseClientsInstance();
  clients.stop();

  hf3fs::stopAndJoin(nullptr);
}
|
||||
|
||||
// Forwards to Impl::stop.
void FuseApplication::stop() {
  impl_->stop();
}
|
||||
|
||||
// Exposes the FUSE config owned by Impl; the pointer is non-owning.
config::IConfig *FuseApplication::getConfig() {
  auto &cfg = impl_->hf3fsConfig;
  return &cfg;
}
|
||||
|
||||
// Exposes the app info owned by Impl; the pointer is non-owning.
const flat::AppInfo *FuseApplication::info() const {
  const auto &appInfo = impl_->appInfo;
  return &appInfo;
}
|
||||
|
||||
// True only when neither --cfg nor --use_local_cfg was given.
bool FuseApplication::configPushable() const {
  if (!FLAGS_cfg.empty()) {
    return false;
  }
  return !FLAGS_use_local_cfg;
}
|
||||
|
||||
// Persists the current FUSE config again after it has been updated.
void FuseApplication::onConfigUpdated() {
  app_detail::persistConfig(impl_->hf3fsConfig);
}
|
||||
|
||||
int FuseApplication::Impl::mainLoop() {
|
||||
Thread::unblockInterruptSignals();
|
||||
|
||||
return fuseMainLoop(programName, allowOther, configMountpoint, configMaxBufSize, configClusterId);
|
||||
}
|
||||
|
||||
// Forwards to Impl::mainLoop.
int FuseApplication::mainLoop() {
  return impl_->mainLoop();
}
|
||||
|
||||
} // namespace hf3fs::fuse
|
||||
|
||||
#endif
|
||||
55
src/fuse/FuseApplication.h
Normal file
55
src/fuse/FuseApplication.h
Normal file
@@ -0,0 +1,55 @@
|
||||
#pragma once
|
||||
|
||||
#ifdef ENABLE_FUSE_APPLICATION
|
||||
|
||||
#include "FuseAppConfig.h"
|
||||
#include "FuseConfig.h"
|
||||
#include "FuseConfigFetcher.h"
|
||||
#include "FuseLauncherConfig.h"
|
||||
#include "common/app/ApplicationBase.h"
|
||||
#include "core/app/ServerLauncher.h"
|
||||
|
||||
namespace hf3fs::fuse {
|
||||
class FuseApplication : public ApplicationBase {
|
||||
public:
|
||||
static constexpr auto kName = "Fuse";
|
||||
static constexpr auto kNodeType = flat::NodeType::FUSE;
|
||||
|
||||
using AppConfig = FuseAppConfig;
|
||||
using LauncherConfig = FuseLauncherConfig;
|
||||
using RemoteConfigFetcher = FuseConfigFetcher;
|
||||
using Launcher = core::ServerLauncher<FuseApplication>;
|
||||
|
||||
using Config = FuseConfig;
|
||||
|
||||
FuseApplication();
|
||||
~FuseApplication();
|
||||
|
||||
private:
|
||||
Result<Void> parseFlags(int *argc, char ***argv) final;
|
||||
|
||||
Result<Void> initApplication() final;
|
||||
|
||||
void stop() final;
|
||||
|
||||
int mainLoop() final;
|
||||
|
||||
config::IConfig *getConfig() final;
|
||||
|
||||
const flat::AppInfo *info() const final;
|
||||
|
||||
bool configPushable() const final;
|
||||
|
||||
void onConfigUpdated() final;
|
||||
|
||||
private:
|
||||
Result<Void> initServer();
|
||||
|
||||
Result<Void> startServer();
|
||||
|
||||
struct Impl;
|
||||
std::unique_ptr<Impl> impl_;
|
||||
};
|
||||
} // namespace hf3fs::fuse
|
||||
|
||||
#endif
|
||||
440
src/fuse/FuseClients.cc
Normal file
440
src/fuse/FuseClients.cc
Normal file
@@ -0,0 +1,440 @@
|
||||
#include "FuseClients.h"
|
||||
|
||||
#include <folly/Random.h>
|
||||
#include <folly/ScopeGuard.h>
|
||||
#include <folly/executors/IOThreadPoolExecutor.h>
|
||||
#include <folly/experimental/coro/BlockingWait.h>
|
||||
#include <folly/functional/Partial.h>
|
||||
#include <folly/logging/xlog.h>
|
||||
#include <fuse3/fuse_lowlevel.h>
|
||||
#include <memory>
|
||||
#include <thread>
|
||||
#include <utility>
|
||||
|
||||
#include "common/app/ApplicationBase.h"
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/utils/BackgroundRunner.h"
|
||||
#include "common/utils/Coroutine.h"
|
||||
#include "common/utils/Duration.h"
|
||||
#include "common/utils/FileUtils.h"
|
||||
#include "common/utils/SysResource.h"
|
||||
#include "fbs/meta/Common.h"
|
||||
#include "fbs/mgmtd/Rpc.h"
|
||||
#include "stubs/MetaService/MetaServiceStub.h"
|
||||
#include "stubs/common/RealStubFactory.h"
|
||||
#include "stubs/mgmtd/MgmtdServiceStub.h"
|
||||
|
||||
namespace hf3fs::fuse {
|
||||
namespace {
|
||||
// Reports the size of the dirty-inode set; updated at the start of every periodicSyncScan() run.
monitor::ValueRecorder dirtyInodesCnt("fuse.dirty_inodes");
|
||||
|
||||
/// Blocks until a client session has been established with mgmtd.
///
/// Calls `extendClientSession()` up to 40 times. Between failed attempts it sleeps
/// with an exponential back-off that starts at 10ms and is capped at 1s.
/// Every failure is logged at CRITICAL level.
///
/// @return Void on success, otherwise the error of the last attempt.
Result<Void> establishClientSession(client::IMgmtdClientForClient &mgmtdClient) {
  return folly::coro::blockingWait([&]() -> CoTryTask<void> {
    constexpr int kMaxAttempts = 40;
    constexpr auto maxRetryInterval = std::chrono::milliseconds(1000);
    auto retryInterval = std::chrono::milliseconds(10);
    Result<Void> res = Void{};
    for (int i = 0; i < kMaxAttempts; ++i) {
      res = co_await mgmtdClient.extendClientSession();
      if (res) break;
      XLOGF(CRITICAL, "Try to establish client session failed: {}\nretryCount: {}", res.error(), i);
      if (i + 1 == kMaxAttempts) {
        // No further attempt follows, so sleeping would only delay the error.
        break;
      }
      co_await folly::coro::sleep(retryInterval);
      retryInterval = std::min(2 * retryInterval, maxRetryInterval);
    }
    co_return res;
  }());
}
|
||||
} // namespace
|
||||
|
||||
// Makes sure everything started by init() is shut down before the members are destroyed.
FuseClients::~FuseClients() {
  stop();
}
|
||||
|
||||
/// Initializes every client-side component used by the FUSE daemon.
///
/// In order: copies settings out of `fuseConfig`, loads the token, creates the RDMA buffer
/// pool and the iov/ioring tables, starts the net / mgmtd / storage / meta clients, creates
/// the three IO job queues, launches the IO worker coroutines and watcher threads, starts
/// the periodic sync machinery and registers the config hot-update callback.
///
/// @param appInfo     application info; its clusterId is used as the FUSE mount name.
/// @param mountPoint  mount point path (stored lexically normalized).
/// @param tokenFile   file the token is read from when HF3FS_FUSE_TOKEN is not set.
/// @param fuseConfig  FUSE config; a pointer to it is retained in `config`, so it must
///                    outlive this object.
/// @return an error if the token file cannot be loaded, the net client fails to start, a
///         hostname cannot be resolved, or the client session cannot be established.
Result<Void> FuseClients::init(const flat::AppInfo &appInfo,
                               const String &mountPoint,
                               const String &tokenFile,
                               FuseConfig &fuseConfig) {
  config = &fuseConfig;

  fuseMount = appInfo.clusterId;
  XLOGF_IF(FATAL,
           fuseMount.size() >= 32,
           "FUSE only support mount name shorter than 32 characters, but {} got.",
           fuseMount);

  fuseMountpoint = Path(mountPoint).lexically_normal();

  if (fuseConfig.remount_prefix()) {
    fuseRemountPref = Path(*fuseConfig.remount_prefix()).lexically_normal();
  }

  // The environment variable takes precedence over the token file.
  if (const char *env_p = std::getenv("HF3FS_FUSE_TOKEN")) {
    XLOGF(INFO, "Use token from env var");
    fuseToken = std::string(env_p);
  } else {
    XLOGF(INFO, "Use token from config");
    auto tokenRes = loadFile(tokenFile);
    RETURN_ON_ERROR(tokenRes);
    fuseToken = folly::trimWhitespace(*tokenRes);
  }
  enableWritebackCache = fuseConfig.enable_writeback_cache();
  memsetBeforeRead = fuseConfig.memset_before_read();
  maxIdleThreads = fuseConfig.max_idle_threads();
  // Cap the thread count at half of the logical cores (rounded up) when the core count is known.
  int logicalCores = std::thread::hardware_concurrency();
  if (logicalCores != 0) {
    maxThreads = std::min(fuseConfig.max_threads(), (logicalCores + 1) / 2);
  } else {
    maxThreads = fuseConfig.max_threads();
  }
  bufPool = net::RDMABufPool::create(fuseConfig.io_bufs().max_buf_size(), fuseConfig.rdma_buf_pool_size());

  iovs.init(fuseRemountPref.value_or(fuseMountpoint), fuseConfig.iov_limit());
  iors.init(fuseConfig.iov_limit());
  userConfig.init(fuseConfig);

  if (!client) {
    client = std::make_unique<net::Client>(fuseConfig.client());
    RETURN_ON_ERROR(client->start());
  }
  auto ctxCreator = [this](net::Address addr) { return client->serdeCtx(addr); };
  if (!mgmtdClient) {
    mgmtdClient = std::make_shared<client::MgmtdClientForClient>(
        appInfo.clusterId,
        std::make_unique<stubs::RealStubFactory<mgmtd::MgmtdServiceStub>>(ctxCreator),
        fuseConfig.mgmtd());
  }

  auto physicalHostnameRes = SysResource::hostname(/*physicalMachineName=*/true);
  RETURN_ON_ERROR(physicalHostnameRes);

  auto containerHostnameRes = SysResource::hostname(/*physicalMachineName=*/false);
  RETURN_ON_ERROR(containerHostnameRes);

  auto clientId = ClientId::random(*physicalHostnameRes);

  mgmtdClient->setClientSessionPayload({clientId.uuid.toHexString(),
                                        flat::NodeType::FUSE,
                                        flat::ClientSessionData::create(
                                            /*universalId=*/*physicalHostnameRes,
                                            /*description=*/fmt::format("fuse: {}", *containerHostnameRes),
                                            appInfo.serviceGroups,
                                            appInfo.releaseVersion),
                                        // TODO: use real user info
                                        flat::UserInfo{}});

  mgmtdClient->setConfigListener(ApplicationBase::updateConfig);

  folly::coro::blockingWait(mgmtdClient->start(&client->tpg().bgThreadPool().randomPick()));
  folly::coro::blockingWait(mgmtdClient->refreshRoutingInfo(/*force=*/false));
  RETURN_ON_ERROR(establishClientSession(*mgmtdClient));

  storageClient = storage::client::StorageClient::create(clientId, fuseConfig.storage(), *mgmtdClient);

  metaClient =
      std::make_shared<meta::client::MetaClient>(clientId,
                                                 fuseConfig.meta(),
                                                 std::make_unique<meta::client::MetaClient::StubFactory>(ctxCreator),
                                                 mgmtdClient,
                                                 storageClient,
                                                 true /* dynStripe */);
  metaClient->start(client->tpg().bgThreadPool());

  // One job queue per priority: index 0 = high, 1 = normal, 2 = low.
  iojqs.reserve(3);
  iojqs.emplace_back(std::make_unique<BoundedQueue<IoRingJob>>(fuseConfig.io_jobq_sizes().hi()));
  iojqs.emplace_back(std::make_unique<BoundedQueue<IoRingJob>>(fuseConfig.io_jobq_size()));
  iojqs.emplace_back(std::make_unique<BoundedQueue<IoRingJob>>(fuseConfig.io_jobq_sizes().lo()));

  jitter = fuseConfig.submit_wait_jitter();

  // Spread the IO worker coroutines round-robin over the background thread pool.
  auto &tp = client->tpg().bgThreadPool();
  auto coros = fuseConfig.batch_io_coros();
  for (int i = 0; i < coros; ++i) {
    auto exec = &tp.get(i % tp.size());
    co_withCancellation(cancelIos.getToken(), ioRingWorker(i, coros)).scheduleOn(exec).start();
  }

  // One watcher thread per priority.
  ioWatches.reserve(3);
  for (int i = 0; i < 3; ++i) {
    ioWatches.emplace_back(folly::partial(&FuseClients::watch, this, i));
  }

  periodicSyncWorker = std::make_unique<CoroutinesPool<InodeId>>(config->periodic_sync().worker());
  periodicSyncWorker->start(folly::partial(&FuseClients::periodicSync, this), tp);

  // The scan interval is randomized to 70%-130% of the configured value.
  periodicSyncRunner = std::make_unique<BackgroundRunner>(&tp.pickNextFree());
  periodicSyncRunner->start("PeriodSync", folly::partial(&FuseClients::periodicSyncScan, this), [&]() {
    return config->periodic_sync().interval() * folly::Random::randDouble(0.7, 1.3);
  });

  // Re-read the hot-updatable settings cached on this object whenever the config changes.
  onFuseConfigUpdated = fuseConfig.addCallbackGuard([&fuseConfig = fuseConfig, this] {
    memsetBeforeRead = fuseConfig.memset_before_read();
    jitter = std::chrono::duration_cast<std::chrono::nanoseconds>(fuseConfig.submit_wait_jitter());
  });

  notifyInvalExec =
      std::make_unique<folly::IOThreadPoolExecutor>(fuseConfig.notify_inval_threads(),
                                                    std::make_shared<folly::NamedThreadFactory>("NotifyInvalThread"));

  return Void{};
}
|
||||
|
||||
void FuseClients::stop() {
|
||||
if (notifyInvalExec) {
|
||||
notifyInvalExec->stop();
|
||||
notifyInvalExec.reset();
|
||||
}
|
||||
if (onFuseConfigUpdated) {
|
||||
onFuseConfigUpdated.reset();
|
||||
}
|
||||
|
||||
cancelIos.requestCancellation();
|
||||
|
||||
for (auto &t : ioWatches) {
|
||||
t.request_stop();
|
||||
}
|
||||
if (periodicSyncRunner) {
|
||||
folly::coro::blockingWait(periodicSyncRunner->stopAll());
|
||||
periodicSyncRunner.reset();
|
||||
}
|
||||
if (periodicSyncWorker) {
|
||||
periodicSyncWorker->stopAndJoin();
|
||||
periodicSyncWorker.reset();
|
||||
}
|
||||
if (metaClient) {
|
||||
metaClient->stop();
|
||||
metaClient.reset();
|
||||
}
|
||||
if (storageClient) {
|
||||
storageClient->stop();
|
||||
storageClient.reset();
|
||||
}
|
||||
if (mgmtdClient) {
|
||||
folly::coro::blockingWait(mgmtdClient->stop());
|
||||
mgmtdClient.reset();
|
||||
}
|
||||
if (client) {
|
||||
client->stopAndJoin();
|
||||
client.reset();
|
||||
}
|
||||
}
|
||||
|
||||
/// IO worker coroutine: repeatedly dequeues an IoRingJob and processes it.
///
/// @param i    index of this worker among all IO workers.
/// @param ths  total number of IO workers.
///
/// Worker priority is derived from `i`: the first `io_worker_coros().hi()` workers are
/// priority 0, the last `io_worker_coros().lo()` workers are priority 2, the rest are 1.
/// The coroutine ends when it is cancelled; any other exception is fatal.
CoTask<void> FuseClients::ioRingWorker(int i, int ths) {
  // a worker thread has its own priority, but it can also execute jobs from queues with a higher priority
  // checkHigher is used to make sure the job queue with the thread's own priority doesn't starve
  bool checkHigher = true;

  while (true) {
    auto res = co_await folly::coro::co_awaitTry([this, &checkHigher, i, ths]() -> CoTask<void> {
      IoRingJob job;
      auto hiThs = config->io_worker_coros().hi(), loThs = config->io_worker_coros().lo();
      auto prio = i < hiThs ? 0 : i < (ths - loThs) ? 1 : 2;
      if (!config->enable_priority()) {
        job = co_await iojqs[prio]->co_dequeue();
      } else {
        bool gotJob = false;

        // if checkHigher, dequeue from a higher job queue if it is full
        while (!gotJob) {
          if (checkHigher) {
            for (int nprio = 0; nprio < prio; ++nprio) {
              if (iojqs[nprio]->full()) {
                auto dres = iojqs[nprio]->try_dequeue();
                if (dres) {
                  // got a job from higher priority queue, next time pick a same priority job unless the queue is empty
                  checkHigher = false;
                  gotJob = true;
                  job = std::move(*dres);
                  break;
                }
              }
            }

            if (gotJob) {
              break;
            }
          }

          // if checkHigher, check from higher prio to lower; otherwise, reverse the checking direction
          for (int nprio = checkHigher ? 0 : prio; checkHigher ? nprio <= prio : nprio >= 0;
               nprio += checkHigher ? 1 : -1) {
            auto [sres, dres] =
                co_await folly::coro::collectAnyNoDiscard(folly::coro::sleep(config->io_job_deq_timeout()),
                                                          iojqs[nprio]->co_dequeue());
            if (dres.hasValue()) {
              // if the job is the thread's own priority, next time it can check from higher priority queues
              if (!checkHigher && nprio == prio) {
                checkHigher = true;
              }
              gotJob = true;
              job = std::move(*dres);
              break;
            } else if (sres.hasValue()) {
              // the dequeue timed out, try the next queue
              continue;
            } else {
              dres.throwUnlessValue();
            }
          }
        }
      }

      while (true) {
        // Resolves the inode of every sqe; entries stay null for unknown inode ids.
        auto lookupFiles =
            [this](std::vector<std::shared_ptr<RcInode>> &ins, const IoArgs *args, const IoSqe *sqes, int sqec) {
              auto lastIid = 0ull;

              std::lock_guard lock(inodesMutex);
              for (int i = 0; i < sqec; ++i) {
                auto idn = args[sqes[i].index].fileIid;
                if (i && idn == lastIid) {
                  // same file as the previous sqe, reuse its lookup result
                  ins.emplace_back(ins.back());
                  continue;
                }

                lastIid = idn;
                auto iid = meta::InodeId(idn);
                auto it = inodes.find(iid);
                ins.push_back(it == inodes.end() ? (std::shared_ptr<RcInode>()) : it->second);
              }
            };
        // Resolves the shared-memory buffer of every sqe and validates its offset/length.
        auto lookupBufs =
            [this](std::vector<Result<lib::ShmBufForIO>> &bufs, const IoArgs *args, const IoSqe *sqe, int sqec) {
              auto lastId = Uuid::zero();
              std::shared_ptr<lib::ShmBuf> lastShm;

              std::lock_guard lock(iovs.shmLock);
              for (int i = 0; i < sqec; ++i) {
                auto &arg = args[sqe[i].index];
                Uuid id;
                memcpy(id.data, arg.bufId, sizeof(id.data));

                std::shared_ptr<lib::ShmBuf> shm;
                // Only reuse a cached buffer that was actually resolved; lastShm is still null
                // until the first successful lookup, even if `id` equals the zero uuid.
                if (i && lastShm && id == lastId) {
                  shm = lastShm;
                } else {
                  auto it = iovs.shmsById.find(id);
                  if (it == iovs.shmsById.end()) {
                    bufs.emplace_back(makeError(StatusCode::kInvalidArg, "buf id not found"));
                    continue;
                  }

                  auto iovd = it->second;
                  shm = iovs.iovs->table[iovd].load();
                  if (!shm) {
                    bufs.emplace_back(makeError(StatusCode::kInvalidArg, "buf id not found"));
                    continue;
                  }

                  lastId = id;
                  lastShm = shm;
                }

                // The range check must run for every sqe, including those that reuse the
                // previous buffer: offset and length differ per sqe.
                if (shm->size < arg.bufOff + arg.ioLen) {
                  bufs.emplace_back(makeError(StatusCode::kInvalidArg, "invalid buf off and/or io len"));
                  continue;
                }

                bufs.emplace_back(lib::ShmBufForIO(std::move(shm), arg.bufOff));
              }
            };

        co_await job.ior->process(job.sqeProcTail,
                                  job.toProc,
                                  *storageClient,
                                  config->storage_io(),
                                  userConfig,
                                  std::move(lookupFiles),
                                  std::move(lookupBufs));

        if (iojqs[0]->full() || job.ior->priority != prio) {
          sem_post(iors.sems[job.ior->priority].get());  // wake the watchers
        } else {
          // try to pick up one more job from the same ring
          auto jobs = job.ior->jobsToProc(1);
          if (!jobs.empty()) {
            job = jobs.front();
            if (!iojqs[0]->try_enqueue(job)) {
              // could not hand the job over, process it right here
              continue;
            }
          }
        }

        break;
      }
    }());
    if (UNLIKELY(res.hasException())) {
      if (res.hasException<OperationCancelled>()) {
        XLOGF(INFO, "io worker #{} cancelled", i);
        break;
      }
      XLOGF(FATAL, "got exception in io worker #{}: {}", i, res.exception().what().toStdString());
    }
  }
}
|
||||
|
||||
void FuseClients::watch(int prio, std::stop_token stop) {
|
||||
while (!stop.stop_requested()) {
|
||||
struct timespec ts;
|
||||
if (clock_gettime(CLOCK_REALTIME, &ts) < 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto nsec = ts.tv_nsec + jitter.load().count();
|
||||
ts.tv_nsec = nsec % 1000000000;
|
||||
ts.tv_sec += nsec / 1000000000;
|
||||
if (sem_timedwait(iors.sems[prio].get(), &ts) < 0 && errno == ETIMEDOUT) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto gotJobs = false;
|
||||
do {
|
||||
gotJobs = false;
|
||||
|
||||
auto n = iors.ioRings->slots.nextAvail.load();
|
||||
for (int i = 0; i < n; ++i) {
|
||||
auto ior = iors.ioRings->table[i].load();
|
||||
|
||||
if (ior && ior->priority == prio) {
|
||||
auto jobs = ior->jobsToProc(config->max_jobs_per_ioring());
|
||||
for (auto &&job : jobs) {
|
||||
gotJobs = true;
|
||||
iojqs[prio]->enqueue(std::move(job));
|
||||
}
|
||||
}
|
||||
}
|
||||
} while (gotJobs); // loop till we found no more jobs and then block in the next iter
|
||||
}
|
||||
}
|
||||
|
||||
/// One round of the periodic sync: takes dirty inodes out of `dirtyInodes` and hands them
/// to the periodic sync worker pool.
///
/// Does nothing when periodic sync is disabled or the mount is read-only. When the number
/// of dirty inodes is within `periodic_sync().limit()`, all of them are taken. Otherwise
/// exactly `limit` inodes are taken in id order, resuming after the inode synced last in
/// the previous round and wrapping around at the end of the set.
CoTask<void> FuseClients::periodicSyncScan() {
  if (!config->periodic_sync().enable() || config->readonly()) {
    co_return;
  }

  XLOGF(INFO, "periodicSyncScan run");
  std::set<InodeId> dirty;
  {
    auto guard = dirtyInodes.lock();
    auto limit = config->periodic_sync().limit();
    dirtyInodesCnt.set(guard->size());
    if (guard->size() <= limit) {
      dirty = std::exchange(*guard, {});
    } else {
      XLOGF(WARN, "dirty inodes {} > limit {}", guard->size(), limit);
      // lastSynced was erased from the set by the round that recorded it, so find() would
      // normally miss and every round would restart from begin(). Resume with the first
      // inode after it instead, so inodes with larger ids are not starved.
      auto iter = guard->upper_bound(lastSynced.load());
      while (dirty.size() < limit) {
        if (iter == guard->end()) {
          // wrap around; the set holds more than `limit` inodes, so it cannot be empty here
          iter = guard->begin();
          XLOGF_IF(FATAL, iter == guard->end(), "iter == guard->end() shouldn't happen");
        } else {
          auto inode = *iter;
          lastSynced = inode;
          iter = guard->erase(iter);
          dirty.insert(inode);
        }
      }
    }
  }

  // the lock is released before enqueueing, which may suspend
  for (auto inode : dirty) {
    co_await periodicSyncWorker->enqueue(inode);
  }

  co_return;
}
|
||||
|
||||
} // namespace hf3fs::fuse
|
||||
243
src/fuse/FuseClients.h
Normal file
243
src/fuse/FuseClients.h
Normal file
@@ -0,0 +1,243 @@
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <folly/MPMCQueue.h>
|
||||
#include <folly/Math.h>
|
||||
#include <folly/Synchronized.h>
|
||||
#include <folly/Utility.h>
|
||||
#include <folly/executors/IOThreadPoolExecutor.h>
|
||||
#include <folly/experimental/coro/Mutex.h>
|
||||
#include <folly/fibers/Semaphore.h>
|
||||
#include <folly/logging/xlog.h>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <sys/types.h>
|
||||
#include <thread>
|
||||
#include <utility>
|
||||
|
||||
#include "common/utils/BackgroundRunner.h"
|
||||
#include "common/utils/CoroutinesPool.h"
|
||||
#include "common/utils/Result.h"
|
||||
#include "common/utils/Semaphore.h"
|
||||
#include "common/utils/UtcTime.h"
|
||||
#include "fbs/core/user/User.h"
|
||||
#include "fbs/meta/Common.h"
|
||||
#define FUSE_USE_VERSION 312
|
||||
#define OP_LOG_LEVEL DBG
|
||||
|
||||
#include <folly/concurrency/AtomicSharedPtr.h>
|
||||
#include <fuse3/fuse_lowlevel.h>
|
||||
|
||||
#include "FuseConfig.h"
|
||||
#include "IoRing.h"
|
||||
#include "IovTable.h"
|
||||
#include "PioV.h"
|
||||
#include "UserConfig.h"
|
||||
#include "client/meta/MetaClient.h"
|
||||
#include "client/mgmtd/MgmtdClientForClient.h"
|
||||
#include "client/storage/StorageClient.h"
|
||||
#include "fbs/meta/Schema.h"
|
||||
|
||||
namespace hf3fs::fuse {
|
||||
using flat::Gid;
|
||||
using flat::Uid;
|
||||
using flat::UserInfo;
|
||||
using lib::agent::PioV;
|
||||
using meta::Acl;
|
||||
using meta::Directory;
|
||||
using meta::DirEntry;
|
||||
using meta::Inode;
|
||||
using meta::InodeData;
|
||||
using meta::InodeId;
|
||||
using meta::Permission;
|
||||
using storage::client::IOBuffer;
|
||||
|
||||
struct InodeWriteBuf {
|
||||
std::vector<uint8_t> buf;
|
||||
std::unique_ptr<storage::client::IOBuffer> memh;
|
||||
off_t off{0};
|
||||
size_t len{0};
|
||||
};
|
||||
|
||||
struct RcInode {
|
||||
struct DynamicAttr {
|
||||
uint64_t written = 0;
|
||||
uint64_t synced = 0; // period sync
|
||||
uint64_t fsynced = 0; // fsync, close, truncate, etc...
|
||||
flat::Uid writer = flat::Uid(0);
|
||||
|
||||
uint32_t dynStripe = 1; // dynamic stripe
|
||||
|
||||
uint64_t truncateVer = 0; // largest known truncate version.
|
||||
std::optional<meta::VersionedLength> hintLength; // local hint length
|
||||
std::optional<UtcTime> atime; // local read time, but only update for write open
|
||||
std::optional<UtcTime> mtime; // local write time
|
||||
|
||||
void update(const Inode &inode, uint64_t syncver = 0, bool fsync = false) {
|
||||
if (!inode.isFile()) {
|
||||
return;
|
||||
}
|
||||
|
||||
synced = std::max(synced, syncver);
|
||||
if (written == synced) {
|
||||
// clear local hint, since not write happens after sync
|
||||
hintLength = meta::VersionedLength{0, 0};
|
||||
}
|
||||
if (fsync) {
|
||||
fsynced = std::max(fsynced, syncver);
|
||||
}
|
||||
truncateVer = std::max(truncateVer, inode.asFile().truncateVer);
|
||||
dynStripe = inode.asFile().dynStripe;
|
||||
}
|
||||
};
|
||||
|
||||
Inode inode;
|
||||
int refcount;
|
||||
std::atomic<int> opened;
|
||||
|
||||
std::mutex wbMtx;
|
||||
std::shared_ptr<InodeWriteBuf> writeBuf;
|
||||
|
||||
folly::Synchronized<DynamicAttr> dynamicAttr;
|
||||
folly::coro::Mutex extendStripeLock;
|
||||
|
||||
RcInode(Inode inode, int refcount = 1)
|
||||
: inode(inode),
|
||||
refcount(refcount),
|
||||
extendStripeLock() {
|
||||
if (inode.isFile()) {
|
||||
auto guard = dynamicAttr.wlock();
|
||||
guard->truncateVer = inode.asFile().truncateVer;
|
||||
guard->hintLength = meta::VersionedLength{0, guard->truncateVer};
|
||||
guard->dynStripe = inode.asFile().dynStripe;
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t getTruncateVer() const { return dynamicAttr.rlock()->truncateVer; }
|
||||
|
||||
void update(const Inode &inode, uint64_t syncver = 0, bool fsync = false) {
|
||||
if (!inode.isFile()) {
|
||||
return;
|
||||
} else {
|
||||
auto guard = dynamicAttr.wlock();
|
||||
return guard->update(inode, syncver, fsync);
|
||||
}
|
||||
}
|
||||
|
||||
// clear hint length, force calculate length on next sync
|
||||
void clearHintLength() {
|
||||
auto guard = dynamicAttr.wlock();
|
||||
guard->hintLength = std::nullopt;
|
||||
}
|
||||
|
||||
CoTryTask<uint64_t> beginWrite(flat::UserInfo userInfo,
|
||||
meta::client::MetaClient &meta,
|
||||
uint64_t offset,
|
||||
uint64_t length);
|
||||
|
||||
void finishWrite(flat::UserInfo userInfo, uint64_t truncateVer, uint64_t offset, ssize_t ret);
|
||||
};
|
||||
|
||||
struct FileHandle {
|
||||
std::shared_ptr<RcInode> rcinode;
|
||||
bool oDirect;
|
||||
Uuid sessionId;
|
||||
|
||||
/* FileHandle(std::shared_ptr<RcInode> rcinode, bool oDirect, Uuid sessionId) */
|
||||
/* : rcinode(rcinode), */
|
||||
/* sessionId(sessionId) {} */
|
||||
};
|
||||
|
||||
/// State attached to an open directory. Plain aggregate without default values.
struct DirHandle {
  // NOTE(review): presumably a value drawn from FuseClients::dirHandle -- confirm against the opendir code
  size_t dirId;
  // process id associated with the handle
  pid_t pid;
  // NOTE(review): presumably marks the virtual iov directory -- confirm
  bool iovDir;
};
|
||||
|
||||
struct DirEntryVector {
|
||||
std::shared_ptr<std::vector<DirEntry>> dirEntries;
|
||||
|
||||
DirEntryVector(std::shared_ptr<std::vector<DirEntry>> &&dirEntries)
|
||||
: dirEntries(std::move(dirEntries)) {}
|
||||
};
|
||||
|
||||
struct DirEntryInodeVector {
|
||||
std::shared_ptr<std::vector<DirEntry>> dirEntries;
|
||||
std::shared_ptr<std::vector<std::optional<Inode>>> inodes;
|
||||
|
||||
DirEntryInodeVector(std::shared_ptr<std::vector<DirEntry>> dirEntries,
|
||||
std::shared_ptr<std::vector<std::optional<Inode>>> inodes)
|
||||
: dirEntries(std::move(dirEntries)),
|
||||
inodes(std::move(inodes)) {}
|
||||
};
|
||||
|
||||
struct FuseClients {
|
||||
FuseClients() = default;
|
||||
~FuseClients();
|
||||
|
||||
Result<Void> init(const flat::AppInfo &appInfo,
|
||||
const String &mountPoint,
|
||||
const String &tokenFile,
|
||||
FuseConfig &fuseConfig);
|
||||
void stop();
|
||||
|
||||
CoTask<void> ioRingWorker(int i, int ths);
|
||||
void watch(int prio, std::stop_token stop);
|
||||
|
||||
CoTask<void> periodicSyncScan();
|
||||
CoTask<void> periodicSync(InodeId inodeId);
|
||||
|
||||
std::unique_ptr<net::Client> client;
|
||||
std::shared_ptr<client::MgmtdClientForClient> mgmtdClient;
|
||||
std::shared_ptr<storage::client::StorageClient> storageClient;
|
||||
std::shared_ptr<meta::client::MetaClient> metaClient;
|
||||
|
||||
std::string fuseToken;
|
||||
std::string fuseMount;
|
||||
Path fuseMountpoint;
|
||||
std::optional<Path> fuseRemountPref;
|
||||
std::atomic<bool> memsetBeforeRead = false;
|
||||
int maxIdleThreads = 0;
|
||||
int maxThreads = 0;
|
||||
bool enableWritebackCache = false;
|
||||
|
||||
std::unique_ptr<ConfigCallbackGuard> onFuseConfigUpdated;
|
||||
|
||||
std::unordered_map<InodeId, std::shared_ptr<RcInode>> inodes = {
|
||||
{InodeId::root(), std::make_shared<RcInode>(Inode{}, 2)}};
|
||||
std::mutex inodesMutex;
|
||||
|
||||
std::unordered_map<uint64_t, DirEntryInodeVector> readdirplusResults;
|
||||
std::mutex readdirplusResultsMutex;
|
||||
|
||||
std::atomic_uint64_t dirHandle{0};
|
||||
|
||||
std::shared_ptr<net::RDMABufPool> bufPool;
|
||||
int maxBufsize = 0;
|
||||
|
||||
fuse_session *se = nullptr;
|
||||
|
||||
std::atomic<std::chrono::nanoseconds> jitter;
|
||||
|
||||
IovTable iovs;
|
||||
IoRingTable iors;
|
||||
std::vector<std::unique_ptr<BoundedQueue<IoRingJob>>> iojqs; // job queues
|
||||
std::vector<std::jthread> ioWatches;
|
||||
folly::CancellationSource cancelIos;
|
||||
|
||||
UserConfig userConfig;
|
||||
|
||||
folly::Synchronized<std::set<InodeId>, std::mutex> dirtyInodes;
|
||||
std::atomic<InodeId> lastSynced;
|
||||
std::unique_ptr<BackgroundRunner> periodicSyncRunner;
|
||||
std::unique_ptr<CoroutinesPool<InodeId>> periodicSyncWorker;
|
||||
|
||||
std::unique_ptr<folly::IOThreadPoolExecutor> notifyInvalExec;
|
||||
const FuseConfig *config;
|
||||
};
|
||||
} // namespace hf3fs::fuse
|
||||
92
src/fuse/FuseConfig.h
Normal file
92
src/fuse/FuseConfig.h
Normal file
@@ -0,0 +1,92 @@
|
||||
#pragma once
|
||||
|
||||
#include "client/meta/MetaClient.h"
|
||||
#include "client/mgmtd/MgmtdClientForClient.h"
|
||||
#include "client/storage/StorageClient.h"
|
||||
#include "common/app/ApplicationBase.h"
|
||||
#include "common/utils/ConfigBase.h"
|
||||
#include "common/utils/CoroutinesPool.h"
|
||||
|
||||
namespace hf3fs::fuse {
|
||||
/// Configuration schema of the FUSE client.
///
/// Every entry is declared through the ConfigBase macros. Entries declared with
/// CONFIG_HOT_UPDATED_ITEM are presumably changeable at runtime while CONFIG_ITEM
/// entries are fixed at startup -- confirm against ConfigBase. The declaration order
/// is kept as is because the macros may depend on it.
struct FuseConfig : public ConfigBase<FuseConfig> {
  // --- process-level settings: supplied by the application framework when
  // --- ENABLE_FUSE_APPLICATION is defined, otherwise declared here directly.
#ifdef ENABLE_FUSE_APPLICATION
  CONFIG_OBJ(common, ApplicationBase::Config);
#else
  CONFIG_ITEM(cluster_id, "");
  CONFIG_ITEM(token_file, "");
  CONFIG_ITEM(mountpoint, "");
  CONFIG_ITEM(allow_other, true);
  CONFIG_OBJ(ib_devices, net::IBDevice::Config);
  CONFIG_OBJ(log, logging::LogConfig);
  CONFIG_OBJ(monitor, monitor::Monitor::Config);
#endif

  // --- filesystem behaviour switches and cache timeouts
  CONFIG_HOT_UPDATED_ITEM(enable_priority, false);
  CONFIG_HOT_UPDATED_ITEM(enable_interrupt, false);
  CONFIG_HOT_UPDATED_ITEM(attr_timeout, (double)30);
  CONFIG_HOT_UPDATED_ITEM(entry_timeout, (double)30);
  CONFIG_HOT_UPDATED_ITEM(negative_timeout, (double)5);
  CONFIG_HOT_UPDATED_ITEM(symlink_timeout, (double)5);
  CONFIG_HOT_UPDATED_ITEM(readonly, false);
  CONFIG_HOT_UPDATED_ITEM(memset_before_read, false);
  CONFIG_HOT_UPDATED_ITEM(enable_read_cache, true);
  CONFIG_HOT_UPDATED_ITEM(fsync_length_hint, false);  // for test
  CONFIG_HOT_UPDATED_ITEM(fdatasync_update_length, false);

  // --- FUSE session / kernel interface limits
  CONFIG_ITEM(max_idle_threads, 10);
  CONFIG_ITEM(max_threads, 256);
  CONFIG_ITEM(max_readahead, 16_MB);
  CONFIG_ITEM(max_background, 32);
  CONFIG_ITEM(enable_writeback_cache, false);

  // --- clients used to reach the rest of the cluster
  CONFIG_OBJ(client, net::Client::Config);
  CONFIG_OBJ(mgmtd, client::MgmtdClientForClient::Config);
  CONFIG_OBJ(storage, storage::client::StorageClient::Config);
  // The meta client default is overridden here: dynamic stripe is switched on.
  CONFIG_OBJ(meta, meta::client::MetaClient::Config, [&](auto &cfg) { cfg.set_dynamic_stripe(true); });

  // --- miscellaneous sizing
  CONFIG_ITEM(remount_prefix, (std::optional<std::string>)std::nullopt);
  CONFIG_ITEM(iov_limit, 1_MB);
  CONFIG_ITEM(io_jobq_size, 1024);
  CONFIG_ITEM(batch_io_coros, 128);
  CONFIG_ITEM(rdma_buf_pool_size, 1024);
  CONFIG_ITEM(time_granularity, 1_s);
  CONFIG_HOT_UPDATED_ITEM(check_rmrf, true);
  CONFIG_ITEM(notify_inval_threads, 32);

  CONFIG_ITEM(max_uid, 1_M);

  CONFIG_HOT_UPDATED_ITEM(chunk_size_limit, 0_KB);

  // --- io ring job queues and workers; `hi` / `lo` presumably map to the
  // --- high / low priority rings -- confirm against the users of these sections.
  CONFIG_SECT(io_jobq_sizes, {
    CONFIG_ITEM(hi, 32);
    CONFIG_ITEM(lo, 4096);
  });

  CONFIG_SECT(io_worker_coros, {
    CONFIG_HOT_UPDATED_ITEM(hi, 8);
    CONFIG_HOT_UPDATED_ITEM(lo, 8);
  });

  CONFIG_HOT_UPDATED_ITEM(io_job_deq_timeout, 1_ms);

  CONFIG_OBJ(storage_io, storage::client::IoOptions);

  CONFIG_HOT_UPDATED_ITEM(submit_wait_jitter, 1_ms);
  CONFIG_HOT_UPDATED_ITEM(max_jobs_per_ioring, 32);

  // --- io buffer sizing
  CONFIG_SECT(io_bufs, {
    CONFIG_ITEM(max_buf_size, 1_MB);
    CONFIG_ITEM(max_readahead, 256_KB);
    CONFIG_ITEM(write_buf_size, 1_MB);
  });

  CONFIG_HOT_UPDATED_ITEM(flush_on_stat, true);
  CONFIG_HOT_UPDATED_ITEM(sync_on_stat, true);
  CONFIG_HOT_UPDATED_ITEM(dryrun_bench_mode, false);

  /// Settings of the periodic sync; its worker pool defaults to 4 coroutines.
  struct PeriodSync : public ConfigBase<PeriodSync> {
    CONFIG_HOT_UPDATED_ITEM(enable, true);
    CONFIG_HOT_UPDATED_ITEM(interval, 30_s);
    CONFIG_HOT_UPDATED_ITEM(limit, 1000u);
    CONFIG_HOT_UPDATED_ITEM(flush_write_buf, true);
    CONFIG_OBJ(worker, CoroutinesPoolBase::Config, [](auto &cfg) { cfg.set_coroutines_num(4); });
  };
  CONFIG_OBJ(periodic_sync, PeriodSync);
};
|
||||
} // namespace hf3fs::fuse
|
||||
19
src/fuse/FuseConfigFetcher.cc
Normal file
19
src/fuse/FuseConfigFetcher.cc
Normal file
@@ -0,0 +1,19 @@
|
||||
#include "FuseConfigFetcher.h"
|
||||
|
||||
#include <folly/experimental/coro/BlockingWait.h>
|
||||
|
||||
#include "common/utils/SysResource.h"
|
||||
|
||||
namespace hf3fs::fuse {
|
||||
/// Fills `appInfo.tags` with the universal tags that mgmtd reports for this host.
///
/// Resolves the hostname, makes sure the mgmtd client is initialised, then blocks the
/// calling thread until the tag lookup finishes. Any failing step is returned as the error.
Result<Void> FuseConfigFetcher::completeAppInfo(flat::AppInfo &appInfo [[maybe_unused]]) {
  auto host = SysResource::hostname(/*physicalMachineName=*/true);
  RETURN_ON_ERROR(host);
  RETURN_ON_ERROR(ensureClientInited());

  // The lambda outlives the coroutine it creates, so capturing by reference is safe here.
  auto fetchTags = [&]() -> CoTryTask<void> {
    auto tags = co_await mgmtdClient_->getUniversalTags(*host);
    CO_RETURN_ON_ERROR(tags);
    appInfo.tags = std::move(*tags);
    co_return Void{};
  };
  return folly::coro::blockingWait(fetchTags());
}
|
||||
} // namespace hf3fs::fuse
|
||||
10
src/fuse/FuseConfigFetcher.h
Normal file
10
src/fuse/FuseConfigFetcher.h
Normal file
@@ -0,0 +1,10 @@
|
||||
#pragma once
|
||||
|
||||
#include "core/app/MgmtdClientFetcher.h"
|
||||
|
||||
namespace hf3fs::fuse {
|
||||
struct FuseConfigFetcher : public core::launcher::MgmtdClientFetcher {
|
||||
using core::launcher::MgmtdClientFetcher::MgmtdClientFetcher;
|
||||
Result<Void> completeAppInfo(flat::AppInfo &appInfo) final;
|
||||
};
|
||||
} // namespace hf3fs::fuse
|
||||
10
src/fuse/FuseLauncherConfig.cc
Normal file
10
src/fuse/FuseLauncherConfig.cc
Normal file
@@ -0,0 +1,10 @@
|
||||
#include "FuseLauncherConfig.h"
|
||||
|
||||
#include "common/app/ApplicationBase.h"
|
||||
#include "common/app/Utils.h"
|
||||
|
||||
namespace hf3fs::fuse {
|
||||
/// Initialises this launcher config from `filePath`, forwarding `dump` and the
/// key/value `updates` to the shared app helper unchanged.
void FuseLauncherConfig::init(const String &filePath, bool dump, const std::vector<config::KeyValue> &updates) {
  app_detail::initConfigFromFile(*this, filePath, dump, updates);
}
|
||||
} // namespace hf3fs::fuse
|
||||
24
src/fuse/FuseLauncherConfig.h
Normal file
24
src/fuse/FuseLauncherConfig.h
Normal file
@@ -0,0 +1,24 @@
|
||||
#pragma once
|
||||
|
||||
#include "client/mgmtd/MgmtdClientForClient.h"
|
||||
#include "common/app/NodeId.h"
|
||||
#include "common/net/Client.h"
|
||||
#include "common/utils/ConfigBase.h"
|
||||
|
||||
namespace hf3fs::fuse {
|
||||
/// Configuration read by the FUSE launcher: cluster identity, networking, the mgmtd
/// client, and the mount parameters.
struct FuseLauncherConfig : public ConfigBase<FuseLauncherConfig> {
  CONFIG_ITEM(cluster_id, "");
  CONFIG_OBJ(ib_devices, net::IBDevice::Config);
  CONFIG_OBJ(client, net::Client::Config);
  CONFIG_OBJ(mgmtd_client, client::MgmtdClientForClient::Config);
  CONFIG_ITEM(mountpoint, "");
  CONFIG_ITEM(allow_other, true);
  CONFIG_ITEM(token_file, "");

 public:
  using Base = ConfigBase<FuseLauncherConfig>;
  // Keep the base-class init overloads visible next to the file-based one below.
  using Base::init;

  /// Initialises the config from a file; defined in FuseLauncherConfig.cc.
  void init(const String &filePath, bool dump, const std::vector<config::KeyValue> &updates);
};
|
||||
} // namespace hf3fs::fuse
|
||||
107
src/fuse/FuseMainLoop.cc
Normal file
107
src/fuse/FuseMainLoop.cc
Normal file
@@ -0,0 +1,107 @@
|
||||
#include "FuseMainLoop.h"
|
||||
|
||||
#include <folly/ScopeGuard.h>
|
||||
#include <folly/logging/xlog.h>
|
||||
|
||||
#include "FuseOps.h"
|
||||
|
||||
namespace hf3fs::fuse {
|
||||
int fuseMainLoop(const String &programName,
|
||||
bool allowOther,
|
||||
const String &mountpoint,
|
||||
size_t maxbufsize,
|
||||
const String &clusterId) {
|
||||
auto &d = getFuseClientsInstance();
|
||||
const auto &ops = getFuseOps();
|
||||
|
||||
std::stack<std::function<void()>> onStopHooks;
|
||||
SCOPE_EXIT {
|
||||
while (!onStopHooks.empty()) {
|
||||
onStopHooks.top()();
|
||||
onStopHooks.pop();
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<std::string> fuseArgs;
|
||||
fuseArgs.push_back(programName);
|
||||
if (allowOther) {
|
||||
fuseArgs.push_back("-o");
|
||||
fuseArgs.push_back("allow_other");
|
||||
fuseArgs.push_back("-o");
|
||||
fuseArgs.push_back("default_permissions");
|
||||
}
|
||||
fuseArgs.push_back("-o");
|
||||
fuseArgs.push_back("auto_unmount");
|
||||
fuseArgs.push_back("-o");
|
||||
fuseArgs.push_back(fmt::format("max_read={}", maxbufsize));
|
||||
fuseArgs.push_back(mountpoint);
|
||||
fuseArgs.push_back("-o");
|
||||
fuseArgs.push_back("subtype=hf3fs");
|
||||
fuseArgs.push_back("-o");
|
||||
fuseArgs.push_back("fsname=hf3fs." + clusterId);
|
||||
std::vector<char *> fuseArgsPtr;
|
||||
for (auto &arg : fuseArgs) {
|
||||
fuseArgsPtr.push_back(const_cast<char *>(arg.c_str()));
|
||||
}
|
||||
|
||||
struct fuse_args args = FUSE_ARGS_INIT((int)fuseArgsPtr.size(), fuseArgsPtr.data());
|
||||
// struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
|
||||
struct fuse_cmdline_opts opts;
|
||||
struct fuse_loop_config *config = fuse_loop_cfg_create();
|
||||
SCOPE_EXIT { fuse_loop_cfg_destroy(config); };
|
||||
|
||||
if (fuse_parse_cmdline(&args, &opts) != 0) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
onStopHooks.push([&] {
|
||||
free(opts.mountpoint);
|
||||
fuse_opt_free_args(&args);
|
||||
});
|
||||
|
||||
if (opts.show_help) {
|
||||
printf("This is hf3fs fuse!\n");
|
||||
fuse_cmdline_help();
|
||||
fuse_lowlevel_help();
|
||||
return 0;
|
||||
} else if (opts.show_version) {
|
||||
printf("What's my version?\n");
|
||||
fuse_lowlevel_version();
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (opts.mountpoint == nullptr) {
|
||||
printf("No mountpoint.\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
d.se = fuse_session_new(&args, &ops, sizeof(ops), NULL);
|
||||
if (d.se == nullptr) {
|
||||
return 1;
|
||||
}
|
||||
onStopHooks.push([&] { fuse_session_destroy(d.se); });
|
||||
|
||||
if (fuse_set_signal_handlers(d.se) != 0) {
|
||||
return 1;
|
||||
}
|
||||
onStopHooks.push([&] { fuse_remove_signal_handlers(d.se); });
|
||||
|
||||
if (fuse_session_mount(d.se, opts.mountpoint) != 0) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
onStopHooks.push([&] { fuse_session_unmount(d.se); });
|
||||
|
||||
int ret = -1;
|
||||
if (opts.singlethread) {
|
||||
ret = fuse_session_loop(d.se);
|
||||
} else {
|
||||
fuse_loop_cfg_set_clone_fd(config, opts.clone_fd);
|
||||
fuse_loop_cfg_set_idle_threads(config, d.maxIdleThreads);
|
||||
fuse_loop_cfg_set_max_threads(config, d.maxThreads);
|
||||
ret = fuse_session_loop_mt(d.se, config);
|
||||
}
|
||||
|
||||
return ret ? 1 : 0;
|
||||
}
|
||||
} // namespace hf3fs::fuse
|
||||
11
src/fuse/FuseMainLoop.h
Normal file
11
src/fuse/FuseMainLoop.h
Normal file
@@ -0,0 +1,11 @@
|
||||
#pragma once
|
||||
|
||||
#include "common/utils/String.h"
|
||||
|
||||
namespace hf3fs::fuse {
|
||||
int fuseMainLoop(const String &programName,
|
||||
bool allowOther,
|
||||
const String &mountpoint,
|
||||
size_t maxbufsize,
|
||||
const String &clusterId);
|
||||
}
|
||||
2715
src/fuse/FuseOps.cc
Normal file
2715
src/fuse/FuseOps.cc
Normal file
File diff suppressed because it is too large
Load Diff
8
src/fuse/FuseOps.h
Normal file
8
src/fuse/FuseOps.h
Normal file
@@ -0,0 +1,8 @@
|
||||
#pragma once
|
||||
|
||||
#include "FuseClients.h"
|
||||
|
||||
namespace hf3fs::fuse {
|
||||
FuseClients &getFuseClientsInstance();
|
||||
const fuse_lowlevel_ops &getFuseOps();
|
||||
} // namespace hf3fs::fuse
|
||||
285
src/fuse/IoRing.cc
Normal file
285
src/fuse/IoRing.cc
Normal file
@@ -0,0 +1,285 @@
|
||||
#include "IoRing.h"
|
||||
|
||||
#include <optional>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
#include "PioV.h"
|
||||
#include "common/utils/UtcTime.h"
|
||||
#include "fbs/meta/Schema.h"
|
||||
#include "fuse/FuseClients.h"
|
||||
#include "fuse/FuseOps.h"
|
||||
#include "lib/api/hf3fs_usrbio.h"
|
||||
|
||||
namespace hf3fs::fuse {
|
||||
/// Claims pending submissions and cuts them into jobs for the io workers.
///
/// Looks at the sqes between the internal processing tail and the shared sqe head and
/// produces at most `maxJobs` jobs. Each claimed range is recorded in `sqeProcTails_`
/// and counted in `processing_` so that completion slots stay reserved for it.
///
/// Batch size depends on `ioDepth`:
///  - `ioDepth > 0`: only full batches of exactly `ioDepth` ios are claimed.
///  - `ioDepth == 0`: everything that is pending and has a free cqe slot is claimed.
///  - `ioDepth < 0`: batches of at most `-ioDepth` ios; a smaller batch is held back
///    until `timeout` has elapsed since it was first seen (when `timeout` is non-zero).
///
/// @param maxJobs upper bound on the number of jobs returned by one call
/// @return the claimed jobs, possibly empty
std::vector<IoRingJob> IoRing::jobsToProc(int maxJobs) {
  std::vector<IoRingJob> jobs;

  std::lock_guard lock(cqeMtx_);
  auto spt = sqeProcTail_;
  auto sqes = sqeCount();

  // One slot is always kept free; in-flight ios already own their future cqe slots.
  auto cqeAvail = entries - 1 - processing_ - cqeCount();
  while (sqes && (int)jobs.size() < maxJobs) {
    int toProc;
    if (ioDepth > 0) {
      toProc = ioDepth;
      if (toProc > sqes || toProc > cqeAvail) {  // even if we finish the io, we got no place to store the results
        break;
      }
    } else {
      toProc = std::min(sqes, cqeAvail);
      if (ioDepth < 0) {
        auto iod = -ioDepth;
        if (toProc > iod) {
          toProc = iod;
        } else if (toProc < iod && timeout.count()) {
          // Only read the clock here. lastCheck_ must keep the time at which the short batch
          // was first observed, otherwise the timeout below can never expire.
          const auto now = SteadyClock::now();
          if (!lastCheck_) {  // first time to find the (not enough) ios, wait till timeout
            lastCheck_ = now;
            break;
          } else if (*lastCheck_ + timeout > now) {  // ios not enough to fill a batch, and time has not run out
            break;
          }
        }

        // A batch is about to be claimed, so the wait for the next short batch starts afresh.
        lastCheck_ = std::nullopt;
      }
    }

    if (jobs.empty()) {
      jobs.reserve(ioDepth ? std::min(maxJobs, sqes / abs(ioDepth) + 1) : 1);
    }

    jobs.emplace_back(IoRingJob{shared_from_this(), spt, toProc});

    spt = (spt + toProc) % entries;
    sqeProcTails_.push_back(spt);
    processing_ += toProc;
    sqes -= toProc;
    cqeAvail -= toProc;
  }

  sqeProcTail_ = spt;
  return jobs;
}
|
||||
|
||||
/// Executes one claimed batch of `toProc` submissions starting at sqe slot `spt`.
///
/// Steps: resolve inodes and buffers through the two lookup callbacks, queue every io
/// into a PioV executor, run the reads or writes, then publish one cqe per sqe and post
/// the completion semaphore. The shared sqe tail only advances once all earlier
/// claimed ranges have completed as well. A per-io failure is stored as a negative
/// status code in that io's result and does not stop the rest of the batch.
///
/// @param lookupFiles appends one inode per sqe of the given contiguous sqe range
/// @param lookupBufs  appends one buffer lookup result per sqe of the given contiguous sqe range
CoTask<void> IoRing::process(
    int spt,
    int toProc,
    storage::client::StorageClient &storageClient,
    const storage::client::IoOptions &storageIo,
    UserConfig &userConfig,
    std::function<void(std::vector<std::shared_ptr<RcInode>> &, const IoArgs *, const IoSqe *, int)> &&lookupFiles,
    std::function<void(std::vector<Result<lib::ShmBufForIO>> &, const IoArgs *, const IoSqe *, int)> &&lookupBufs) {
  // The recorders are function-local statics, so their mount tag is taken from the first ring that gets here.
  const auto mountTag = [this] { return monitor::TagSet{{"mount_name", mountName}}; };
  static monitor::LatencyRecorder overallLatency("usrbio.piov.overall", mountTag());
  static monitor::LatencyRecorder prepareLatency("usrbio.piov.prepare", mountTag());
  static monitor::LatencyRecorder submitLatency("usrbio.piov.submit", mountTag());
  static monitor::LatencyRecorder completeLatency("usrbio.piov.complete", mountTag());
  static monitor::DistributionRecorder ioSizeDist("usrbio.piov.io_size", mountTag());
  static monitor::DistributionRecorder ioDepthDist("usrbio.piov.io_depth", mountTag());
  static monitor::DistributionRecorder totalBytesDist("usrbio.piov.total_bytes", mountTag());
  static monitor::DistributionRecorder distinctFilesDist("usrbio.piov.distinct_files", mountTag());
  static monitor::DistributionRecorder distinctBufsDist("usrbio.piov.distinct_bufs", mountTag());
  static monitor::CountRecorder bwCount("usrbio.piov.bw", mountTag());

  const auto overallStart = SteadyClock::now();
  auto start = overallStart;
  std::string ioType = forRead_ ? "read" : "write";
  auto uids = std::to_string(userInfo_.uid.toUnderType());
  // Tag set attached to every sample recorded for this batch.
  const auto ioTags = [&] { return monitor::TagSet{{"io", ioType}, {"uid", uids}}; };

  auto &config = userConfig.getConfig(userInfo_);

  std::vector<ssize_t> res;
  if (!forRead_ && config.readonly()) {
    // Writes are rejected wholesale in read-only mode; the cqes are still reported below.
    res = std::vector<ssize_t>(toProc, static_cast<ssize_t>(-StatusCode::kReadOnlyMode));
  } else {
    res = std::vector<ssize_t>(toProc, 0);

    size_t iod = 0, totalBytes = 0;
    std::set<uint64_t> distinctFiles;
    std::set<Uuid> distinctBufs;

    // The claimed range may wrap around the end of the ring, hence up to two lookups each.
    const int untilWrap = std::min(toProc, entries - spt);

    std::vector<std::shared_ptr<RcInode>> inodes;
    inodes.reserve(toProc);
    lookupFiles(inodes, ringSection, sqeSection + spt, untilWrap);
    if ((int)inodes.size() < toProc) {
      lookupFiles(inodes, ringSection, sqeSection, toProc - (int)inodes.size());
    }

    std::vector<Result<lib::ShmBufForIO>> bufs;
    bufs.reserve(toProc);
    lookupBufs(bufs, ringSection, sqeSection + spt, untilWrap);
    if ((int)bufs.size() < toProc) {
      lookupBufs(bufs, ringSection, sqeSection, toProc - (int)bufs.size());
    }

    lib::agent::PioV ioExec(storageClient, config.chunk_size_limit(), res);
    std::vector<uint64_t> truncateVers;
    if (!forRead_) {
      truncateVers.resize(toProc, 0);
    }

    for (int i = 0; i < toProc; ++i) {
      const auto sqe = sqeSection[(spt + i) % entries];
      const auto &args = ringSection[sqe.index];

      ++iod;
      totalBytes += args.ioLen;
      distinctFiles.insert(args.fileIid);

      Uuid bufUuid;
      memcpy(bufUuid.data, args.bufId, sizeof(bufUuid.data));
      distinctBufs.insert(bufUuid);

      ioSizeDist.addSample(args.ioLen, ioTags());

      if (!inodes[i]) {
        res[i] = -static_cast<ssize_t>(MetaCode::kNotFile);
        continue;
      }

      if (!bufs[i]) {
        res[i] = -static_cast<ssize_t>(bufs[i].error().code());
        continue;
      }

      auto memh = co_await bufs[i]->memh(args.ioLen);
      if (!memh) {
        res[i] = -static_cast<ssize_t>(memh.error().code());
        continue;
      }
      if (!bufs[i]->ptr() || !*memh) {
        XLOGF(ERR, "{} is null when doing usrbio", *memh ? "buf ptr" : "memh");
        res[i] = -static_cast<ssize_t>(ClientAgentCode::kIovShmFail);
        continue;
      }

      if (!forRead_) {
        auto beginWrite =
            co_await inodes[i]->beginWrite(userInfo_, *getFuseClientsInstance().metaClient, args.fileOff, args.ioLen);
        if (beginWrite.hasError()) {
          res[i] = -static_cast<ssize_t>(beginWrite.error().code());
          continue;
        }
        truncateVers[i] = *beginWrite;
      }

      auto addRes = forRead_
                        ? ioExec.addRead(i, inodes[i]->inode, 0, args.fileOff, args.ioLen, bufs[i]->ptr(), **memh)
                        : ioExec.addWrite(i, inodes[i]->inode, 0, args.fileOff, args.ioLen, bufs[i]->ptr(), **memh);
      if (!addRes) {
        res[i] = -static_cast<ssize_t>(addRes.error().code());
      }
    }

    auto now = SteadyClock::now();
    prepareLatency.addSample(now - start, ioTags());
    start = now;

    ioDepthDist.addSample(iod, ioTags());
    totalBytesDist.addSample(totalBytes, ioTags());
    distinctFilesDist.addSample(distinctFiles.size(), ioTags());
    distinctBufsDist.addSample(distinctBufs.size(), ioTags());

    auto readOpt = storageIo.read();
    if (flags_ & HF3FS_IOR_ALLOW_READ_UNCOMMITTED) {
      readOpt.set_allowReadUncommitted(true);
    }
    auto execRes = co_await (forRead_ ? ioExec.executeRead(userInfo_, readOpt)
                                      : ioExec.executeWrite(userInfo_, storageIo.write()));

    now = SteadyClock::now();
    submitLatency.addSample(now - start, ioTags());
    start = now;

    if (execRes) {
      ioExec.finishIo(!(flags_ & HF3FS_IOR_FORBID_READ_HOLES));
    } else {
      // The whole execution failed: every io without an earlier error gets the execution error.
      for (auto &r : res) {
        if (r >= 0) {
          r = -static_cast<ssize_t>(execRes.error().code());
        }
      }
    }

    if (!forRead_) {
      for (int i = 0; i < toProc; ++i) {
        auto &inode = inodes[i];
        if (!inode) {
          continue;
        }
        const auto sqe = sqeSection[(spt + i) % entries];
        const auto off = ringSection[sqe.index].fileOff;
        inode->finishWrite(userInfo_.uid, truncateVers[i], off, res[i]);
      }
    }
  }

  const auto newSpt = (spt + toProc) % entries;

  // Copy the sqes out before the shared tail moves past them.
  std::vector<IoSqe> sqes(toProc);
  for (int i = 0; i < toProc; ++i) {
    sqes[i] = sqeSection[(spt + i) % entries];
  }

  {
    // lock for between threads (io workers)
    // atomics for between processes (io worker & io generator)
    std::lock_guard lock(cqeMtx_);
    if (sqeProcTails_.empty()) {
      XLOGF(FATAL, "bug?! sqeProcTails_ is empty");
    }

    if (sqeProcTails_.front() != newSpt) {
      // An earlier range is still in flight; remember this one as done and let it advance the tail.
      sqeDoneTails_.insert(newSpt);
    } else {
      sqeTail = newSpt;
      sqeProcTails_.pop_front();
      // Drain every following range that already finished out of order.
      while (!sqeDoneTails_.empty()) {
        if (sqeProcTails_.empty()) {
          XLOGF(FATAL, "bug?! sqeProcTails_ is empty");
        }
        const auto first = sqeProcTails_.front();
        auto it = sqeDoneTails_.find(first);
        if (it == sqeDoneTails_.end()) {
          break;
        }
        sqeTail = first;
        sqeProcTails_.pop_front();
        sqeDoneTails_.erase(it);
      }
    }

    for (int i = 0; i < toProc; ++i) {
      const auto &sqe = sqes[i];
      const auto r = res[i];
      // Negative results are status codes; they are mapped to errno values in the cqe.
      auto addRes = addCqe(sqe.index, r >= 0 ? r : -static_cast<ssize_t>(StatusCode::toErrno(-r)), sqe.userdata);
      if (!addRes) {
        XLOGF(FATAL, "failed to add cqe");
      }
    }

    processing_ -= toProc;
  }

  sem_post(cqeSem.get());

  size_t doneBytes = 0;
  for (auto r : res) {
    if (r > 0) {
      doneBytes += r;
    }
  }
  bwCount.addSample(doneBytes, ioTags());

  const auto now = SteadyClock::now();
  completeLatency.addSample(now - start, ioTags());
  overallLatency.addSample(now - overallStart, ioTags());
}
|
||||
} // namespace hf3fs::fuse
|
||||
279
src/fuse/IoRing.h
Normal file
279
src/fuse/IoRing.h
Normal file
@@ -0,0 +1,279 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <semaphore.h>
|
||||
|
||||
#include "IovTable.h"
|
||||
#include "UserConfig.h"
|
||||
#include "client/storage/StorageClient.h"
|
||||
#include "common/utils/AtomicSharedPtrTable.h"
|
||||
#include "common/utils/Coroutine.h"
|
||||
#include "common/utils/Uuid.h"
|
||||
#include "fbs/meta/Schema.h"
|
||||
#include "lib/common/Shm.h"
|
||||
|
||||
namespace hf3fs::fuse {
|
||||
struct RcInode;
|
||||
/// Arguments of one io, stored in the ring's args section and referenced by `IoSqe::index`.
/// Field order and types define the in-ring layout; do not reorder.
struct IoArgs {
  uint8_t bufId[16];  // raw 16-byte id of the io buffer (copied into a Uuid by IoRing::process)
  size_t bufOff;      // presumably the offset inside that buffer -- confirm against the buffer lookup

  uint64_t fileIid;  // id of the target file
  size_t fileOff;    // offset inside the file

  uint64_t ioLen;  // number of bytes to read or write

  const void *userdata;  // opaque value owned by the submitter
};
|
||||
|
||||
/// Submission queue entry: points at an `IoArgs` slot and carries the submitter's userdata.
struct IoSqe {
  int32_t index;         // index into the ring's IoArgs section
  const void *userdata;  // copied unchanged into the matching completion entry
};
|
||||
|
||||
/// Completion queue entry reporting the outcome of one submitted io.
struct IoCqe {
  int32_t index;         // IoArgs index of the io this completion belongs to
  int32_t reserved;      // not written by IoRing::addCqe
  int64_t result;        // value passed to IoRing::addCqe: a non-negative result or a negated errno
  const void *userdata;  // userdata taken from the submission entry
};
|
||||
|
||||
class IoRing;
|
||||
|
||||
struct IoRingJob {
|
||||
std::shared_ptr<IoRing> ior;
|
||||
int sqeProcTail;
|
||||
int toProc;
|
||||
};
|
||||
|
||||
// we allow multiple io workers to process the same ioring, but different ranges
|
||||
// so 1 ioring can be used to submit ios processed in parallel
|
||||
// howoever, we don't allow multiple threads to prepare ios in the same ioring
|
||||
// or batches may be mixed and things may get ugly
|
||||
class IoRing : public std::enable_shared_from_this<IoRing> {
|
||||
public:
|
||||
static int ringMarkerSize() {
|
||||
auto n = std::atomic_ref<int32_t>::required_alignment;
|
||||
return (4 + n - 1) / n * n;
|
||||
}
|
||||
// allocate 1 more slot for queue emptiness/fullness checking
|
||||
static int ioRingEntries(size_t bufSize) {
|
||||
auto n = ringMarkerSize();
|
||||
// n * 4 for sqe/cqe head/tail markers
|
||||
return (int)std::min((size_t)std::numeric_limits<int>::max(),
|
||||
(bufSize - 4096 - n * 4 - sizeof(sem_t)) / (sizeof(IoArgs) + sizeof(IoCqe) + sizeof(IoSqe))) -
|
||||
1;
|
||||
}
|
||||
static size_t bytesRequired(int entries) {
|
||||
auto n = ringMarkerSize();
|
||||
// n * 4 for sqe/cqe head/tail markers
|
||||
return n * 4 + sizeof(sem_t) + (sizeof(IoArgs) + sizeof(IoCqe) + sizeof(IoSqe)) * (entries + 1) + 4096;
|
||||
}
|
||||
|
||||
public:
|
||||
using std::enable_shared_from_this<IoRing>::shared_from_this;
|
||||
|
||||
// the shm arg is used to keep it from being destroyed when the iov link is removed
|
||||
IoRing(std::shared_ptr<lib::ShmBuf> shm,
|
||||
std::string_view nm,
|
||||
const meta::UserInfo &ui,
|
||||
bool read,
|
||||
uint8_t *buf,
|
||||
size_t size,
|
||||
int iod,
|
||||
int prio,
|
||||
Duration to,
|
||||
uint64_t flags,
|
||||
bool owner = true)
|
||||
: name(nm),
|
||||
entries(ioRingEntries(size) + 1),
|
||||
ioDepth(iod),
|
||||
priority(prio),
|
||||
timeout(to),
|
||||
sqeHead_((int32_t *)buf),
|
||||
sqeTail_((int32_t *)(buf + ringMarkerSize())),
|
||||
cqeHead_((int32_t *)(buf + ringMarkerSize() * 2)),
|
||||
cqeTail_((int32_t *)(buf + ringMarkerSize() * 3)),
|
||||
sqeHead(*sqeHead_),
|
||||
sqeTail(*sqeTail_),
|
||||
cqeHead(*cqeHead_),
|
||||
cqeTail(*cqeTail_),
|
||||
ringSection((IoArgs *)(buf + ringMarkerSize() * 4)),
|
||||
cqeSection((IoCqe *)(ringSection + entries)),
|
||||
sqeSection((IoSqe *)(cqeSection + entries)),
|
||||
slots(entries - 1),
|
||||
shm_(std::move(shm)),
|
||||
userInfo_(ui),
|
||||
forRead_(read),
|
||||
flags_(flags) {
|
||||
XLOGF_IF(FATAL,
|
||||
(uintptr_t)(sqeSection + entries + sizeof(sem_t)) > (uintptr_t)(buf + size),
|
||||
"sem has a bad address {}, after whole shm starts at {} with {} bytes",
|
||||
(void *)(sqeSection + entries + sizeof(sem_t)),
|
||||
(void *)buf,
|
||||
size);
|
||||
auto sem = (sem_t *)(sqeSection + entries);
|
||||
if (owner) {
|
||||
sem_init(sem, 1, 0);
|
||||
}
|
||||
cqeSem.reset(sem);
|
||||
}
|
||||
std::vector<IoRingJob> jobsToProc(int maxJobs);
|
||||
int cqeCount() const { return (cqeHead.load() + entries - cqeTail.load()) % entries; }
|
||||
CoTask<void> process(
|
||||
int spt,
|
||||
int toProc,
|
||||
storage::client::StorageClient &storageClient,
|
||||
const storage::client::IoOptions &storageIo,
|
||||
UserConfig &userConfig,
|
||||
std::function<void(std::vector<std::shared_ptr<RcInode>> &, const IoArgs *, const IoSqe *, int)> &&lookupFiles,
|
||||
std::function<void(std::vector<Result<lib::ShmBufForIO>> &, const IoArgs *, const IoSqe *, int)> &&lookupBufs);
|
||||
|
||||
public:
|
||||
bool addSqe(int idx, const void *userdata) {
|
||||
auto h = sqeHead.load();
|
||||
if ((h + 1) % entries == sqeTail.load()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto &sqe = sqeSection[h];
|
||||
sqe.index = idx;
|
||||
sqe.userdata = userdata;
|
||||
|
||||
sqeHead.store((h + 1) % entries);
|
||||
|
||||
return true;
|
||||
}
|
||||
bool sqeTailAfter(int a, int b) {
|
||||
auto h = sqeHead.load();
|
||||
if (a == h) { // caught up with head, must be the last
|
||||
return true;
|
||||
}
|
||||
auto ah = a > h, bh = b > h;
|
||||
if (ah == bh) { // both after or before head, bigger is after
|
||||
return a > b;
|
||||
} else { // the one before head is after
|
||||
return bh;
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
std::string name;
|
||||
std::string mountName;
|
||||
int entries;
|
||||
int ioDepth;
|
||||
int priority;
|
||||
Duration timeout;
|
||||
|
||||
private:
|
||||
int32_t *sqeHead_;
|
||||
int32_t *sqeTail_;
|
||||
int32_t *cqeHead_;
|
||||
int32_t *cqeTail_;
|
||||
std::optional<SteadyTime> lastCheck_;
|
||||
|
||||
public:
|
||||
std::atomic_ref<int32_t> sqeHead;
|
||||
std::atomic_ref<int32_t> sqeTail;
|
||||
std::atomic_ref<int32_t> cqeHead;
|
||||
std::atomic_ref<int32_t> cqeTail;
|
||||
IoArgs *ringSection;
|
||||
IoCqe *cqeSection;
|
||||
IoSqe *sqeSection;
|
||||
std::unique_ptr<sem_t, std::function<void(sem_t *)>> cqeSem{nullptr, [](sem_t *p) { sem_destroy(p); }};
|
||||
|
||||
public:
|
||||
AvailSlots slots;
|
||||
|
||||
private:
|
||||
int sqeCount() const { return (sqeHead.load() + entries - sqeProcTail_) % entries; }
|
||||
[[nodiscard]] bool addCqe(int idx, ssize_t res, const void *userdata) {
|
||||
auto h = cqeHead.load();
|
||||
if ((h + 1) % entries == cqeTail.load()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto &cqe = cqeSection[h];
|
||||
cqe.index = idx;
|
||||
cqe.result = res;
|
||||
cqe.userdata = userdata;
|
||||
|
||||
cqeHead.store((h + 1) % entries);
|
||||
return true;
|
||||
}
|
||||
|
||||
private: // for fuse
|
||||
std::shared_ptr<lib::ShmBuf> shm_;
|
||||
meta::UserInfo userInfo_;
|
||||
bool forRead_;
|
||||
uint64_t flags_;
|
||||
std::mutex cqeMtx_; // when reporting cqes
|
||||
int sqeProcTail_{0};
|
||||
int processing_{0};
|
||||
std::deque<int> sqeProcTails_; // tails claimed and processing
|
||||
std::set<int> sqeDoneTails_; // tails done processing
|
||||
};
|
||||
|
||||
struct IoRingTable {
|
||||
void init(int cap) {
|
||||
for (int prio = 0; prio <= 2; ++prio) {
|
||||
auto sp = "/" + semOpenPath(prio);
|
||||
sems.emplace_back(sem_open(sp.c_str(), O_CREAT, 0666, 0), [sp](sem_t *p) {
|
||||
sem_close(p);
|
||||
sem_unlink(sp.c_str());
|
||||
});
|
||||
chmod(semPath(prio).c_str(), 0666);
|
||||
}
|
||||
ioRings = std::make_unique<AtomicSharedPtrTable<IoRing>>(cap);
|
||||
}
|
||||
Result<int> addIoRing(const Path &mountName,
|
||||
std::shared_ptr<lib::ShmBuf> shm,
|
||||
std::string_view name,
|
||||
const meta::UserInfo &ui,
|
||||
bool forRead,
|
||||
uint8_t *buf,
|
||||
size_t size,
|
||||
int ioDepth,
|
||||
const hf3fs::lib::IorAttrs &attrs) {
|
||||
auto idxRes = ioRings->alloc();
|
||||
if (!idxRes) {
|
||||
return makeError(ClientAgentCode::kTooManyOpenFiles, "too many io rings");
|
||||
}
|
||||
|
||||
auto idx = *idxRes;
|
||||
|
||||
auto ior = std::make_shared<
|
||||
IoRing>(std::move(shm), name, ui, forRead, buf, size, ioDepth, attrs.priority, attrs.timeout, attrs.flags);
|
||||
ior->mountName = mountName.native();
|
||||
ioRings->table[idx].store(ior);
|
||||
|
||||
return idx;
|
||||
}
|
||||
void rmIoRing(int idx) { ioRings->remove(idx); }
|
||||
std::vector<std::unique_ptr<sem_t, std::function<void(sem_t *)>>> sems;
|
||||
std::unique_ptr<AtomicSharedPtrTable<IoRing>> ioRings;
|
||||
|
||||
private:
|
||||
static std::string semOpenPath(int prio) {
|
||||
static std::vector<Uuid> semIds{Uuid::random(), Uuid::random(), Uuid::random()};
|
||||
return fmt::format("hf3fs-submit-ios.{}", semIds[prio].toHexString());
|
||||
}
|
||||
|
||||
public:
|
||||
static std::string semName(int prio) {
|
||||
return fmt::format("submit-ios{}", prio == 1 ? "" : prio == 0 ? ".ph" : ".pl");
|
||||
}
|
||||
static Path semPath(int prio) { return Path("/dev/shm") / ("sem." + semOpenPath(prio)); }
|
||||
static meta::Inode lookupSem(int prio) {
|
||||
static const std::vector<meta::Inode> inodes{
|
||||
{meta::InodeId{meta::InodeId::iovDir().u64() - 1},
|
||||
meta::InodeData{meta::Symlink{semPath(0)}, meta::Acl{meta::Uid{0}, meta::Gid{0}, meta::Permission{0666}}}},
|
||||
{meta::InodeId{meta::InodeId::iovDir().u64() - 2},
|
||||
meta::InodeData{meta::Symlink{semPath(1)}, meta::Acl{meta::Uid{0}, meta::Gid{0}, meta::Permission{0666}}}},
|
||||
{meta::InodeId{meta::InodeId::iovDir().u64() - 3},
|
||||
meta::InodeData{meta::Symlink{semPath(2)}, meta::Acl{meta::Uid{0}, meta::Gid{0}, meta::Permission{0666}}}}};
|
||||
|
||||
return inodes[prio];
|
||||
}
|
||||
};
|
||||
} // namespace hf3fs::fuse
|
||||
337
src/fuse/IovTable.cc
Normal file
337
src/fuse/IovTable.cc
Normal file
@@ -0,0 +1,337 @@
|
||||
#include "IovTable.h"
|
||||
|
||||
#include <folly/experimental/coro/BlockingWait.h>
|
||||
|
||||
#include "IoRing.h"
|
||||
#include "fbs/meta/Common.h"
|
||||
|
||||
namespace hf3fs::fuse {
|
||||
|
||||
using hf3fs::lib::IorAttrs;
|
||||
|
||||
const Path linkPref = "/dev/shm";
|
||||
|
||||
/// Remembers the mount name and allocates the iov table with room for `cap` entries.
void IovTable::init(const Path &mount, int cap) {
  mountName = mount.native();
  iovs = std::make_unique<AtomicSharedPtrTable<lib::ShmBuf>>(cap);
}
|
||||
|
||||
struct IovAttrs {
|
||||
Uuid id;
|
||||
size_t blockSize = 0;
|
||||
bool isIoRing = false;
|
||||
bool forRead = true;
|
||||
int ioDepth = 0;
|
||||
std::optional<IorAttrs> iora;
|
||||
};
|
||||
|
||||
/// Decodes a shm key of the form `<uuid-hex>[.<part>]...` into IovAttrs.
///
/// Each part is selected by its first character:
///   b<n>  block size, must be positive
///   r<n>  read io ring with io depth n;  w<n>  write io ring with io depth n
///   t<n>  io ring timeout in milliseconds, must not be negative
///   f<b>  io ring flags written in base 2
///   p[l|h|n]  io ring priority: l = 2, h = 0, n or nothing = 1
/// Parts starting with any other character are ignored.
///
/// @return the attributes, or kInvalidArg for a malformed part or when io ring
///         attributes are given for a key that is not an io ring; a bad uuid
///         returns the uuid parser's error
static Result<IovAttrs> parseKey(const char *key) {
  IovAttrs attrs;

  std::vector<std::string> parts;
  folly::split('.', key, parts);

  auto uuid = Uuid::fromHexString(parts[0]);
  RETURN_ON_ERROR(uuid);
  attrs.id = *uuid;

  // The io ring attributes are created on first use by any of the t/f/p parts.
  const auto iorAttrs = [&attrs]() -> IorAttrs & {
    if (!attrs.iora) {
      attrs.iora = IorAttrs{};
    }
    return *attrs.iora;
  };

  for (size_t n = 1; n < parts.size(); ++n) {
    const auto &dec = parts[n];
    switch (dec[0]) {
      case 'b': {  // block size
        const auto blockSize = atoll(dec.c_str() + 1);
        if (blockSize <= 0) {
          return makeError(StatusCode::kInvalidArg, "invalid block size set in shm key");
        }
        attrs.blockSize = (size_t)blockSize;
        break;
      }

      case 'r':
      case 'w': {  // is io ring
        const auto depth = atoll(dec.c_str() + 1);
        attrs.isIoRing = true;
        attrs.forRead = dec[0] == 'r';
        attrs.ioDepth = depth;
        break;
      }

      case 't': {
        auto &iora = iorAttrs();
        const auto millis = atoi(dec.c_str() + 1);
        if (millis < 0) {
          return makeError(StatusCode::kInvalidArg, "invalid io job check timeout {}", dec.c_str() + 1);
        }
        iora.timeout = Duration(std::chrono::nanoseconds((uint64_t)millis * 1000000));
        break;
      }

      case 'f': {
        auto &iora = iorAttrs();
        char *end;
        const auto flags = strtoull(dec.c_str() + 1, &end, 2);
        // Anything left unparsed means the part was not a pure base-2 number.
        if (*end != 0) {
          return makeError(StatusCode::kInvalidArg, "invalid io exec flags {}", dec.c_str() + 1);
        }
        iora.flags = flags;
        break;
      }

      case 'p': {  // should be io ring, priority
        auto &iora = iorAttrs();
        switch (dec.c_str()[1]) {
          case 'l':
            iora.priority = 2;
            break;
          case 'h':
            iora.priority = 0;
            break;
          case 'n':
          case '\0':
            iora.priority = 1;
            break;
          default:
            return makeError(StatusCode::kInvalidArg, "invalid priority set in shm key");
        }
        break;
      }
    }
  }

  if (!attrs.isIoRing && attrs.iora) {
    return makeError(StatusCode::kInvalidArg, "ioring attrs set for non-ioring");
  }

  return attrs;
}
|
||||
|
||||
constexpr int iovIidStart = meta::InodeId::iovIidStart;
|
||||
|
||||
std::optional<int> IovTable::iovDesc(meta::InodeId iid) {
|
||||
auto iidn = (ssize_t)iid.u64();
|
||||
auto diid = (ssize_t)meta::InodeId::iovDir().u64();
|
||||
if (iidn >= 0 || iidn > diid - iovIidStart || iidn < diid - std::numeric_limits<int>::max()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
return diid - iidn - iovIidStart;
|
||||
}
|
||||
|
||||
/// Registers the shm file at `shmPath` as a new iov described by `key`.
///
/// Steps: decode the key, check that `shmPath` is a regular file large enough for the requested
/// block size / io depth, reserve a descriptor, map the buffer, publish it in the table, register it
/// for IO (skipped for io rings) and finally record it in the key and uuid lookup maps.
///
/// @param key      iov key, see parseKey()
/// @param shmPath  path of the shm file; the buffer is opened as "/" + path relative to linkPref
/// @param pid      stored in the buffer
/// @param ui       user the iov belongs to
/// @param exec,sc  passed on to ShmBuf::registerForIO
/// @return the symlink inode of the iov plus the buffer itself for io rings (a null pointer
///         otherwise), or an error.
///
/// If anything fails (or throws) after the descriptor was reserved, the table slot is cleared before
/// the descriptor is handed back, so a freed descriptor never keeps exposing a half-initialised
/// buffer. The lookup maps are only touched once every fallible step has succeeded.
Result<std::pair<meta::Inode, std::shared_ptr<lib::ShmBuf>>> IovTable::addIov(const char *key,
                                                                              const Path &shmPath,
                                                                              pid_t pid,
                                                                              const meta::UserInfo &ui,
                                                                              folly::Executor::KeepAlive<> exec,
                                                                              storage::client::StorageClient &sc) {
  // function-local statics: constructed on the first call, tagged with that call's mountName
  static monitor::DistributionRecorder mapTimesCount("fuse.iov.times", monitor::TagSet{{"mount_name", mountName}});
  static monitor::DistributionRecorder mapBytesDist("fuse.iov.bytes", monitor::TagSet{{"mount_name", mountName}});
  static monitor::CountRecorder shmSizeCount("fuse.iov.total_bytes", monitor::TagSet{{"mount_name", mountName}}, false);
  static monitor::LatencyRecorder allocLatency("fuse.iov.latency.map", monitor::TagSet{{"mount_name", mountName}});
  static monitor::DistributionRecorder ibRegBytesDist("fuse.iov.bytes.ib_reg",
                                                      monitor::TagSet{{"mount_name", mountName}});
  static monitor::LatencyRecorder ibRegLatency("fuse.iov.latency.ib_reg", monitor::TagSet{{"mount_name", mountName}});

  auto iovaRes = parseKey(key);
  RETURN_ON_ERROR(iovaRes);

  Path shmOpenPath("/");
  shmOpenPath /= shmPath.lexically_relative(linkPref);

  struct stat st;
  if (stat(shmPath.c_str(), &st) == -1 || !S_ISREG(st.st_mode)) {
    return makeError(StatusCode::kInvalidArg, "failed to stat shm path or it's not a regular file");
  }

  if (iovaRes->blockSize > (size_t)st.st_size) {
    return makeError(StatusCode::kInvalidArg, "invalid block size set in shm key");
  } else if (iovaRes->isIoRing && iovaRes->ioDepth > IoRing::ioRingEntries((size_t)st.st_size)) {
    return makeError(StatusCode::kInvalidArg, "invalid io batch size set in shm key");
  }

  auto iovdRes = iovs->alloc();
  if (!iovdRes) {
    return makeError(ClientAgentCode::kTooManyOpenFiles, "too many iovs allocated");
  }
  auto iovd = *iovdRes;
  bool dealloc = true;
  SCOPE_EXIT {
    if (dealloc) {
      // drop whatever we may have published before giving the descriptor back
      iovs->table[iovd].store(std::shared_ptr<lib::ShmBuf>());
      iovs->dealloc(iovd);
    }
  };

  auto start = SteadyClock::now();
  auto uids = std::to_string(ui.uid.toUnderType());

  std::shared_ptr<lib::ShmBuf> shm;
  try {
    shm.reset(
        new lib::ShmBuf(shmOpenPath, 0, st.st_size, iovaRes->blockSize, iovaRes->id),
        // custom deleter: deregister, unmap and account for the buffer when the last owner lets go
        [uids,
         &shmSizeCount = shmSizeCount,
         &mapTimesCount = mapTimesCount,
         &mapBytesDist = mapBytesDist,
         &allocLatency = allocLatency,
         &ibRegLatency = ibRegLatency](auto p) {
          auto start = SteadyClock::now();
          folly::coro::blockingWait(p->deregisterForIO());
          auto now = SteadyClock::now();
          ibRegLatency.addSample(now - start, monitor::TagSet{{"instance", "dereg"}, {"uid", uids}});

          start = now;
          p->unmapBuf();
          allocLatency.addSample(SteadyClock::now() - start, monitor::TagSet{{"instance", "free"}, {"uid", uids}});

          mapTimesCount.addSample(1, monitor::TagSet{{"instance", "free"}, {"uid", uids}});
          mapBytesDist.addSample(p->size, monitor::TagSet{{"instance", "free"}, {"uid", uids}});
          shmSizeCount.addSample(-p->size);

          delete p;
        });
  } catch (const std::runtime_error &e) {
    return makeError(ClientAgentCode::kIovShmFail, std::string("failed to open/map shm for iov ") + e.what());
  }

  allocLatency.addSample(SteadyClock::now() - start, monitor::TagSet{{"instance", "alloc"}, {"uid", uids}});
  mapTimesCount.addSample(1, monitor::TagSet{{"instance", "alloc"}, {"uid", uids}});
  mapBytesDist.addSample(shm->size, monitor::TagSet{{"instance", "alloc"}, {"uid", uids}});
  shmSizeCount.addSample(shm->size, monitor::TagSet{{"uid", uids}});

  shm->key = key;
  shm->user = ui.uid;
  shm->pid = pid;
  shm->isIoRing = iovaRes->isIoRing;
  shm->forRead = iovaRes->forRead;
  shm->ioDepth = iovaRes->ioDepth;
  shm->iora = iovaRes->iora;

  // the idx should be reserved by us
  iovs->table[iovd].store(shm);

  start = SteadyClock::now();
  auto recordMetrics = [blockSize = shm->blockSize, start, uids]() mutable {
    ibRegBytesDist.addSample(blockSize, monitor::TagSet{{"instance", "reg"}, {"uid", uids}});
    ibRegLatency.addSample(SteadyClock::now() - start, monitor::TagSet{{"instance", "reg"}, {"uid", uids}});
  };

  if (!iovaRes->isIoRing) {  // io ring bufs don't need to be registered for ib io
    folly::coro::blockingWait(shm->registerForIO(exec, sc, recordMetrics));
  }

  // last fallible step; done before the lookup maps are updated so an error leaves no stale entries
  auto statRes = statIov(iovd, ui);
  RETURN_ON_ERROR(statRes);

  {
    std::unique_lock lock(iovdLock_);
    iovds_[key] = iovd;
  }

  {
    std::unique_lock lock(shmLock);
    shmsById[iovaRes->id] = iovd;
  }

  dealloc = false;
  return std::make_pair(*statRes, iovaRes->isIoRing ? shm : std::shared_ptr<lib::ShmBuf>());
}
|
||||
|
||||
/// Removes the iov registered under `key` and returns its buffer.
///
/// The key is resolved through lookupIov() first, so an unknown key or a key owned by another user
/// is rejected with that function's error. The descriptor is derived from the looked-up inode id
/// and validated before any state is modified.
Result<std::shared_ptr<lib::ShmBuf>> IovTable::rmIov(const char *key, const meta::UserInfo &ui) {
  auto inodeRes = lookupIov(key, ui);
  RETURN_ON_ERROR(inodeRes);

  auto iovd = iovDesc(inodeRes->id);
  if (!iovd) {
    return makeError(MetaCode::kNotFound, std::string("iov key does not map to a valid iov desc ") + key);
  }

  {
    std::unique_lock lock(iovdLock_);
    iovds_.erase(key);
  }

  // the uuid is only known through the key; skip the uuid map when the key does not parse
  if (auto attrs = parseKey(key); attrs) {
    std::unique_lock lock(shmLock);
    shmsById.erase(attrs->id);
  }

  auto shm = iovs->table[*iovd].load();
  iovs->remove(*iovd);

  return shm;
}
|
||||
|
||||
/// Builds the symlink inode that represents iov descriptor `iovd`.
///
/// Fails with kNotFound when the descriptor is out of range or its slot is empty, and with
/// kNoPermission when the buffer belongs to a user other than `ui.uid`. On success the inode has
/// the id InodeId::iov(iovd), points at linkPref / <buffer path> and is owned by `ui` with mode 0400.
Result<meta::Inode> IovTable::statIov(int iovd, const meta::UserInfo &ui) {
  const bool inRange = iovd >= 0 && iovd < static_cast<int>(iovs->table.size());
  if (!inRange) {
    return makeError(MetaCode::kNotFound, "invalid iov desc");
  }

  const auto buf = iovs->table[iovd].load();
  if (!buf) {
    return makeError(MetaCode::kNotFound,
                     fmt::format("iov desc {} not found, next avail {}", iovd, iovs->slots.nextAvail.load()));
  }

  if (buf->user != ui.uid) {
    XLOGF(ERR, "statting user {} iov belongs to {}", ui.uid, buf->user);
    return makeError(MetaCode::kNoPermission, "iov not for user");
  }

  meta::InodeData data{meta::Symlink{linkPref / buf->path}, meta::Acl{ui.uid, ui.gid, meta::Permission(0400)}};
  return meta::Inode{meta::InodeId::iov(iovd), std::move(data)};
}
|
||||
|
||||
/// Resolves `key` to its descriptor (under a shared lock on the key map) and returns the result of
/// statIov() for it; an unknown key yields kNotFound.
Result<meta::Inode> IovTable::lookupIov(const char *key, const meta::UserInfo &ui) {
  std::optional<int> found;
  {
    std::shared_lock guard(iovdLock_);
    if (auto pos = iovds_.find(key); pos != iovds_.end()) {
      found = pos->second;
    }
  }

  if (!found) {
    return makeError(MetaCode::kNotFound, std::string("iov key not found ") + key);
  }
  return statIov(*found, ui);
}
|
||||
|
||||
/// Lists the entries of the iov directory as seen by `ui`.
///
/// The result holds two parallel vectors (directory entries and their inodes). The first three
/// entries are always the semaphores of priorities 0..2; they are followed by every buffer in the
/// first `nextAvail` table slots that belongs to `ui.uid`, each as a 0400 symlink owned by `ui`.
std::pair<std::shared_ptr<std::vector<meta::DirEntry>>, std::shared_ptr<std::vector<std::optional<meta::Inode>>>>
IovTable::listIovs(const meta::UserInfo &ui) {
  const auto n = iovs->slots.nextAvail.load();

  auto entries = std::make_shared<std::vector<meta::DirEntry>>();
  auto inodes = std::make_shared<std::vector<std::optional<meta::Inode>>>();
  entries->reserve(n + 3);
  inodes->reserve(n + 3);

  // template entry; only the name changes from one entry to the next
  meta::DirEntry de{meta::InodeId::iovDir(), ""};

  for (int prio = 0; prio <= 2; ++prio) {
    de.name = IoRingTable::semName(prio);
    entries->emplace_back(de);
    inodes->emplace_back(IoRingTable::lookupSem(prio));
  }

  const meta::Acl acl{meta::Uid{ui.uid}, meta::Gid{ui.gid}, meta::Permission{0400}};
  for (int slot = 0; slot < n; ++slot) {
    const auto buf = iovs->table[slot].load();
    const bool visible = buf && buf->user == ui.uid;
    if (!visible) {
      continue;
    }

    de.name = buf->key;
    entries->emplace_back(de);
    inodes->emplace_back(meta::Inode{meta::InodeId{meta::InodeId::iov(slot)},
                                     meta::InodeData{meta::Symlink{linkPref / buf->path}, acl}});
  }

  return std::make_pair(std::move(entries), std::move(inodes));
}
|
||||
} // namespace hf3fs::fuse
|
||||
39
src/fuse/IovTable.h
Normal file
39
src/fuse/IovTable.h
Normal file
@@ -0,0 +1,39 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "common/utils/AtomicSharedPtrTable.h"
|
||||
#include "fbs/meta/Schema.h"
|
||||
#include "lib/common/Shm.h"
|
||||
|
||||
namespace hf3fs::fuse {
|
||||
class IovTable {
|
||||
public:
|
||||
IovTable() = default;
|
||||
void init(const Path &mount, int cap);
|
||||
Result<std::pair<meta::Inode, std::shared_ptr<lib::ShmBuf>>> addIov(const char *key,
|
||||
const Path &shmPath,
|
||||
pid_t pid,
|
||||
const meta::UserInfo &ui,
|
||||
folly::Executor::KeepAlive<> exec,
|
||||
storage::client::StorageClient &sc);
|
||||
Result<std::shared_ptr<lib::ShmBuf>> rmIov(const char *key, const meta::UserInfo &ui);
|
||||
Result<meta::Inode> lookupIov(const char *key, const meta::UserInfo &ui);
|
||||
std::optional<int> iovDesc(meta::InodeId iid);
|
||||
Result<meta::Inode> statIov(int key, const meta::UserInfo &ui);
|
||||
|
||||
public:
|
||||
std::pair<std::shared_ptr<std::vector<meta::DirEntry>>, std::shared_ptr<std::vector<std::optional<meta::Inode>>>>
|
||||
listIovs(const meta::UserInfo &ui);
|
||||
|
||||
public:
|
||||
std::string mountName;
|
||||
std::shared_mutex shmLock;
|
||||
robin_hood::unordered_map<Uuid, int> shmsById;
|
||||
std::unique_ptr<AtomicSharedPtrTable<lib::ShmBuf>> iovs;
|
||||
|
||||
private:
|
||||
mutable std::shared_mutex iovdLock_;
|
||||
robin_hood::unordered_map<std::string, int> iovds_;
|
||||
};
|
||||
} // namespace hf3fs::fuse
|
||||
275
src/fuse/PioV.cc
Normal file
275
src/fuse/PioV.cc
Normal file
@@ -0,0 +1,275 @@
|
||||
#include "PioV.h"
|
||||
|
||||
namespace hf3fs::lib::agent {
|
||||
/// Binds the batch to a storage client, a per-request size limit and the caller's result vector,
/// and takes a snapshot of the current routing info from the mgmtd client.
///
/// NOTE(review): a missing routing info is only reported through DFATAL; the pointer is
/// dereferenced right afterwards regardless — confirm DFATAL aborts in every build that matters.
PioV::PioV(storage::client::StorageClient &storageClient, int chunkSizeLim, std::vector<ssize_t> &res)
    : storageClient_(storageClient),
      chunkSizeLim_(chunkSizeLim),
      res_(res) {
  auto routingInfo = storageClient_.getMgmtdClient().getRoutingInfo();
  const bool missing = !routingInfo || !routingInfo->raw();
  XLOGF_IF(DFATAL, missing, "RoutingInfo not found");
  routingInfo_ = routingInfo->raw();
}
|
||||
|
||||
/// Queues the read of `len` bytes at file offset `off` into `buf` as result slot `idx`.
///
/// A batch holds either reads or writes: adding a read after writes is an error. For a non-file
/// inode nothing is queued and res_[idx] is set to -kNotFile. Otherwise the range is cut into
/// pieces by chunkIo() and one ReadIO is created per piece, each writing into `buf` right behind
/// the previous piece and carrying `idx` as its user context.
hf3fs::Result<Void> PioV::addRead(size_t idx,
                                  const meta::Inode &inode,
                                  uint16_t track,
                                  off_t off,
                                  size_t len,
                                  void *buf,
                                  storage::client::IOBuffer &memh) {
  if (!wios_.empty()) {
    return makeError(StatusCode::kInvalidArg, "adding read to write operations");
  }
  if (!inode.isFile()) {
    res_[idx] = -static_cast<ssize_t>(MetaCode::kNotFile);
    return Void{};
  }

  if (rios_.empty()) {
    rios_.reserve(res_.size());
  }

  auto *dst = static_cast<uint8_t *>(buf);
  size_t filled = 0;  // bytes of buf already assigned to earlier pieces
  auto queuePiece = [this, &memh, &filled, idx, dst](storage::ChainId chain,
                                                     storage::ChunkId chunk,
                                                     uint32_t,
                                                     uint32_t chunkOff,
                                                     uint32_t chunkLen) {
    rios_.emplace_back(
        storageClient_.createReadIO(chain, chunk, chunkOff, chunkLen, dst + filled, &memh, reinterpret_cast<void *>(idx)));
    filled += chunkLen;
  };
  RETURN_ON_ERROR(chunkIo(inode, track, off, len, queuePiece));

  return Void{};
}
|
||||
|
||||
/// Queues the write of `len` bytes from `buf` to file offset `off` as result slot `idx`.
///
/// A batch holds either reads or writes: adding a write after reads is an error. For a non-file
/// inode nothing is queued and res_[idx] is set to -kNotFile. Otherwise the range is cut into
/// pieces by chunkIo() and one WriteIO is created per piece, each reading from `buf` right behind
/// the previous piece and carrying `idx` as its user context.
///
/// potentialLens_[inode.id] is raised to the end offset of every queued piece, i.e. the file length
/// implied by the writes queued so far.
hf3fs::Result<Void> PioV::addWrite(size_t idx,
                                   const meta::Inode &inode,
                                   uint16_t track,
                                   off_t off,
                                   size_t len,
                                   const void *buf,
                                   storage::client::IOBuffer &memh) {
  if (!rios_.empty()) {
    return makeError(StatusCode::kInvalidArg, "adding write to read operations");
  } else if (!inode.isFile()) {
    res_[idx] = -static_cast<ssize_t>(MetaCode::kNotFile);
    return Void{};
  }

  if (wios_.empty()) {
    wios_.reserve(res_.size());
  }

  size_t bufOff = 0;
  RETURN_ON_ERROR(chunkIo(inode,
                          track,
                          off,
                          len,
                          [this, &inode, &memh, &bufOff, idx, buf, off](storage::ChainId chain,
                                                                        storage::ChunkId chunk,
                                                                        uint32_t chunkSize,
                                                                        uint32_t chunkOff,
                                                                        uint32_t chunkLen) {
                            wios_.emplace_back(storageClient_.createWriteIO(chain,
                                                                            chunk,
                                                                            chunkOff,
                                                                            chunkLen,
                                                                            chunkSize,
                                                                            (uint8_t *)buf + bufOff,
                                                                            &memh,
                                                                            reinterpret_cast<void *>(idx)));
                            bufOff += chunkLen;
                            // bufOff already covers this piece, so off + bufOff is where it ends
                            const size_t pieceEnd = static_cast<size_t>(off) + bufOff;
                            auto &knownLen = potentialLens_[inode.id];
                            knownLen = std::max(knownLen, pieceEnd);
                          }));

  return Void{};
}
|
||||
|
||||
// Cuts the byte range [off, off + len) of `inode` into pieces and hands each piece to `consumeChunk`.
//
// The range is first split at chunk boundaries (chunk size taken from the file layout). For every
// chunk touched, the chain id and chunk id are resolved; a failure of either lookup is returned as
// is. The part of the range inside that chunk is then split again into requests of at most `rcs`
// bytes, where `rcs` is the chunk size, capped by chunkSizeLim_ when that limit is non-zero.
//
// consumeChunk receives (chain, chunk, chunk size, offset inside the chunk, request length), in
// ascending file-offset order.
//
// With len == 0 the outer loop still runs once: the lookups are performed for `off`, but
// consumeChunk is not called.
Result<Void> PioV::chunkIo(
|
||||
const meta::Inode &inode,
|
||||
uint16_t track,
|
||||
off_t off,
|
||||
size_t len,
|
||||
std::function<void(storage::ChainId, storage::ChunkId, uint32_t, uint32_t, uint32_t)> &&consumeChunk) {
|
||||
const auto &f = inode.asFile();
|
||||
auto chunkSize = f.layout.chunkSize;
|
||||
// offset of the first byte inside its chunk; reset to 0 after the first chunk
auto chunkOff = off % chunkSize;
|
||||
|
||||
// rcs: upper bound for the length of a single request
auto rcs = chunkSizeLim_ ? std::min((size_t)chunkSizeLim_, chunkSize.u64()) : chunkSize.u64();
|
||||
|
||||
// lastL / l: range-relative start and end of the part that falls into the current chunk.
// l advances by one chunk size per iteration and is clamped to len inside the body.
for (size_t lastL = 0, l = std::min((size_t)(chunkSize - chunkOff), len); // l is within a chunk
|
||||
l < len + chunkSize; // for the last chunk
|
||||
lastL = l, l += chunkSize) {
|
||||
l = std::min(l, len); // l is always growing longer
|
||||
// absolute file offset at which the current part starts
auto opOff = off + lastL;
|
||||
|
||||
auto chain = f.getChainId(inode, opOff, *routingInfo_, track);
|
||||
RETURN_ON_ERROR(chain);
|
||||
auto fchunk = f.getChunkId(inode.id, opOff);
|
||||
RETURN_ON_ERROR(fchunk);
|
||||
auto chunk = storage::ChunkId(*fchunk);
|
||||
auto chunkLen = l - lastL;
|
||||
|
||||
// emit the part in requests of at most rcs bytes
for (size_t co = 0; co < chunkLen; co += rcs) {
|
||||
consumeChunk(*chain, chunk, chunkSize, chunkOff + co, std::min(rcs, chunkLen - co));
|
||||
}
|
||||
|
||||
chunkOff = 0; // chunks other than first always starts from 0
|
||||
}
|
||||
return Void{};
|
||||
}
|
||||
|
||||
/// Sends all queued reads in one batchRead call; completes immediately when nothing is queued.
/// Must not be used on a batch that holds writes or truncate ops (asserted).
CoTryTask<void> PioV::executeRead(const UserInfo &userInfo, const storage::client::ReadOptions &options) {
  assert(wios_.empty() && trops_.empty());

  if (!rios_.empty()) {
    co_return co_await storageClient_.batchRead(rios_, userInfo, options);
  }

  co_return Void{};
}
|
||||
|
||||
/// Runs the queued truncate ops (if any) followed by all queued writes.
///
/// When a truncate op fails, its error code is stored (negated) in the result slot named by the
/// op's user context and every write with the same user context is dropped from the batch; the
/// remaining writes are re-created and sent with a single batchWrite call. A failure of the
/// truncate call itself is returned without writing anything.
CoTryTask<void> PioV::executeWrite(const UserInfo &userInfo, const storage::client::WriteOptions &options) {
  assert(rios_.empty());

  if (wios_.empty()) {
    co_return Void{};
  }

  if (!trops_.empty()) {
    std::vector<storage::client::TruncateChunkOp *> failedOps;
    std::set<size_t> dropped;  // indices into wios_ that must not be sent
    auto truncRes = co_await storageClient_.truncateChunks(trops_, userInfo, options, &failedOps);
    CO_RETURN_ON_ERROR(truncRes);

    if (!failedOps.empty()) {
      for (auto *op : failedOps) {
        res_[reinterpret_cast<size_t>(op->userCtx)] = -static_cast<ssize_t>(op->result.lengthInfo.error().code());
        for (size_t w = 0; w < wios_.size(); ++w) {
          if (wios_[w].userCtx == op->userCtx) {
            dropped.insert(w);
          }
        }
      }

      std::vector<storage::client::WriteIO> kept;
      kept.reserve(wios_.size() - dropped.size());
      for (size_t w = 0; w < wios_.size(); ++w) {
        if (dropped.count(w) != 0) {
          continue;
        }
        auto &wio = wios_[w];
        kept.emplace_back(storageClient_.createWriteIO(wio.routingTarget.chainId,
                                                       wio.chunkId,
                                                       wio.offset,
                                                       wio.length,
                                                       wio.chunkSize,
                                                       wio.data,
                                                       wio.buffer,
                                                       wio.userCtx));
      }
      wios_.swap(kept);
    }
  }

  co_return co_await storageClient_.batchWrite(wios_, userInfo, options);
}
|
||||
|
||||
template <typename Io>
|
||||
void concatIoRes(bool read, std::vector<ssize_t> &res, const Io &ios, bool allowHoles) {
|
||||
ssize_t lastIovIdx = -1;
|
||||
bool inHole = false;
|
||||
std::optional<size_t> holeIo = 0;
|
||||
size_t holeOff = 0;
|
||||
size_t holeSize = 0;
|
||||
ssize_t iovIdx = 0;
|
||||
for (size_t i = 0; i < ios.size(); ++i, lastIovIdx = iovIdx) {
|
||||
const auto &io = ios[i];
|
||||
iovIdx = reinterpret_cast<ssize_t>(io.userCtx);
|
||||
uint32_t iolen = 0;
|
||||
if (io.result.lengthInfo) {
|
||||
iolen = *io.result.lengthInfo;
|
||||
if (iolen > 0 && inHole && lastIovIdx == iovIdx) {
|
||||
// the front part of the data read from a chunk can never be part of a hole when anything is read from the chunk
|
||||
// storage server promises that, or how can it tell us that it only reads into the buffer from the middle?
|
||||
// so the hole size always ends at the last chunk end, and we can add it to res to calc the correct read size
|
||||
// and if the hole is not a hole, but the eof, the prev res will be the no. of bytes read from the file
|
||||
|
||||
const auto &lastIo = ios[i - 1];
|
||||
auto lastChunk = meta::ChunkId::unpack(lastIo.chunkId.data());
|
||||
auto chunk = meta::ChunkId::unpack(io.chunkId.data());
|
||||
XLOGF(ERR,
|
||||
"found hole when {}ing inode id {}, hole starts before chunk idx {} chain id {} got {}_B in chunk "
|
||||
"idx {} / {} chain id {} after hole iov idx {} last iov idx {} hole io idx {} off in first io {} size {}",
|
||||
read ? "read" : "writ",
|
||||
lastChunk.inode().u64(),
|
||||
lastChunk.chunk(),
|
||||
lastIo.routingTarget.chainId,
|
||||
iolen,
|
||||
chunk.inode().u64(),
|
||||
chunk.chunk(),
|
||||
io.routingTarget.chainId,
|
||||
iovIdx,
|
||||
lastIovIdx,
|
||||
*holeIo,
|
||||
holeOff,
|
||||
holeSize);
|
||||
|
||||
if (read && allowHoles) { // zerofill the hole we found
|
||||
auto &hio = ios[*holeIo];
|
||||
memset(hio.data + holeOff, 0, hio.length - holeOff);
|
||||
for (size_t j = *holeIo + 1; j < i; ++j) {
|
||||
memset(ios[j].data, 0, ios[j].length);
|
||||
}
|
||||
|
||||
res[iovIdx] += holeSize;
|
||||
|
||||
inHole = false; // out of hole now, but we may begin a new hole
|
||||
holeIo = std::nullopt;
|
||||
} else {
|
||||
res[iovIdx] = -static_cast<ssize_t>(ClientAgentCode::kHoleInIoOutcome);
|
||||
}
|
||||
} else if (lastIovIdx != iovIdx) {
|
||||
inHole = false;
|
||||
holeIo = std::nullopt;
|
||||
}
|
||||
} else if (read && io.result.lengthInfo.error().code() == StorageClientCode::kChunkNotFound) {
|
||||
// ignore
|
||||
} else {
|
||||
if (res[iovIdx] >= 0) {
|
||||
res[iovIdx] = -static_cast<ssize_t>(io.result.lengthInfo.error().code());
|
||||
}
|
||||
}
|
||||
|
||||
if (res[iovIdx] < 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (iolen < io.length) { // shorter than expected
|
||||
inHole = true;
|
||||
if (!holeIo) {
|
||||
holeIo = i;
|
||||
holeOff = iolen;
|
||||
holeSize = 0;
|
||||
}
|
||||
holeSize += io.length - iolen;
|
||||
}
|
||||
res[iovIdx] += iolen;
|
||||
}
|
||||
}
|
||||
|
||||
void PioV::finishIo(bool allowHoles) {
|
||||
if (wios_.empty()) {
|
||||
concatIoRes(true, res_, rios_, allowHoles);
|
||||
} else {
|
||||
concatIoRes(false, res_, wios_, false);
|
||||
}
|
||||
}
|
||||
} // namespace hf3fs::lib::agent
|
||||
59
src/fuse/PioV.h
Normal file
59
src/fuse/PioV.h
Normal file
@@ -0,0 +1,59 @@
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
|
||||
#include "client/meta/MetaClient.h"
|
||||
#include "client/storage/StorageClient.h"
|
||||
#include "common/utils/Result.h"
|
||||
|
||||
namespace hf3fs::lib::agent {
|
||||
using flat::UserInfo;
|
||||
class PioV {
|
||||
public:
|
||||
PioV(storage::client::StorageClient &storageClient, int chunkSizeLim, std::vector<ssize_t> &res);
|
||||
hf3fs::Result<Void> addRead(size_t idx,
|
||||
const meta::Inode &inode,
|
||||
uint16_t track,
|
||||
off_t off,
|
||||
size_t len,
|
||||
void *buf,
|
||||
storage::client::IOBuffer &memh);
|
||||
// if metaClient and userInfo are not nullptr,
|
||||
// meta server will be contacted for latest file length if known length is shorter than off
|
||||
CoTryTask<bool> checkWriteOff(size_t idx,
|
||||
meta::client::MetaClient *metaClient,
|
||||
const UserInfo *userInfo,
|
||||
const meta::Inode &inode,
|
||||
size_t off);
|
||||
hf3fs::Result<Void> addWrite(size_t idx,
|
||||
const meta::Inode &inode,
|
||||
uint16_t track,
|
||||
off_t off,
|
||||
size_t len,
|
||||
const void *buf,
|
||||
storage::client::IOBuffer &memh);
|
||||
CoTryTask<void> executeRead(const UserInfo &userInfo,
|
||||
const storage::client::ReadOptions &options = storage::client::ReadOptions());
|
||||
CoTryTask<void> executeWrite(const UserInfo &userInfo,
|
||||
const storage::client::WriteOptions &options = storage::client::WriteOptions());
|
||||
void finishIo(bool allowHoles);
|
||||
|
||||
private:
|
||||
Result<Void> chunkIo(
|
||||
const meta::Inode &inode,
|
||||
uint16_t track,
|
||||
off_t off,
|
||||
size_t len,
|
||||
std::function<void(storage::ChainId, storage::ChunkId, uint32_t, uint32_t, uint32_t)> &&consumeChunk);
|
||||
|
||||
private:
|
||||
storage::client::StorageClient &storageClient_;
|
||||
int chunkSizeLim_;
|
||||
std::shared_ptr<flat::RoutingInfo> routingInfo_;
|
||||
std::vector<ssize_t> &res_;
|
||||
std::vector<storage::client::ReadIO> rios_;
|
||||
std::vector<storage::client::WriteIO> wios_;
|
||||
std::vector<storage::client::TruncateChunkOp> trops_;
|
||||
std::map<meta::InodeId, size_t> potentialLens_;
|
||||
};
|
||||
} // namespace hf3fs::lib::agent
|
||||
174
src/fuse/UserConfig.cc
Normal file
174
src/fuse/UserConfig.cc
Normal file
@@ -0,0 +1,174 @@
|
||||
#include "UserConfig.h"
|
||||
|
||||
#include "fbs/meta/Common.h"
|
||||
|
||||
namespace hf3fs::fuse {
|
||||
/// Attaches this object to the global `config`: sizes the per-user table to max_uid() + 1 slots,
/// caches the current storage max_concurrent_transmission value, and registers a callback that
/// refreshes that cache and rebuilds every user's private config as "current global config + the
/// user's recorded overrides".
///
/// NOTE(review): the value returned by addCallbackGuard() is discarded here. If it is an RAII guard
/// that unregisters the callback on destruction, the callback never fires — confirm against ConfigBase.
void UserConfig::init(FuseConfig &config) {
  config_ = &config;
  configs_ = std::make_unique<AtomicSharedPtrTable<LocalConfig>>(config.max_uid() + 1);

  storageMaxConcXmit_ = config.storage().net_client().rdma_control().max_concurrent_transmission();

  config.addCallbackGuard([&config = config, this] {
    storageMaxConcXmit_ = config.storage().net_client().rdma_control().max_concurrent_transmission();

    std::lock_guard usersGuard(userMtx_);
    for (auto user : users_) {
      auto local = configs_->table[user.toUnderType()].load();
      if (local) {
        // start from a fresh copy of the global config and replay the user's overrides on top
        FuseConfig rebuilt = config;

        std::lock_guard localGuard(local->mtx);
        rebuilt.atomicallyUpdate(local->updatedItems, true);
        local->config = std::move(rebuilt);
      }
    }
  });
}
|
||||
|
||||
/// Splits a config key into its namespace and position.
///
/// The key must start with "sys." or "usr."; the remainder must match an entry of systemKeys or
/// userKeys respectively. Returns {is system key, index in the matching list}, or kInvalidArg.
Result<std::pair<bool, int>> UserConfig::parseKey(const char *key) {
  const bool isSys = !strncmp(key, "sys.", 4);
  const bool isUsr = !isSys && !strncmp(key, "usr.", 4);
  if (!isSys && !isUsr) {
    return makeError(StatusCode::kInvalidArg, fmt::format("key {} has to be prefixed with 'sys.' or 'usr.'", key));
  }

  const auto &known = isSys ? systemKeys : userKeys;
  const auto pos = std::find(known.begin(), known.end(), key + 4);
  if (pos != known.end()) {
    return std::make_pair(isSys, static_cast<int>(pos - known.begin()));
  }

  if (isSys) {
    return makeError(StatusCode::kInvalidArg, fmt::format("no such system key or key not customizable {}", key));
  }
  return makeError(StatusCode::kInvalidArg, fmt::format("no such user key or key not customizable {}", key));
}
|
||||
|
||||
/// Sets the config item named by `key` ("sys.<item>" or "usr.<item>") to `val`.
///
/// System keys are applied to the global config. For
/// storage.net_client.rdma_control.max_concurrent_transmission the value must be positive and at
/// most twice the cached system setting.
///
/// User keys are applied to the caller's private config, which is created from the global config on
/// first use; the override is also recorded so it can be replayed later. "readonly" cannot be set to
/// anything but "true" while the global config is read-only. Users whose uid does not fit the
/// per-user table are rejected with kNoPermission.
///
/// @return a symlink inode (target `val`, mode 0400, owned by `ui`) whose id comes from
///         configIid(false, ...), or the error of the failing step.
Result<meta::Inode> UserConfig::setConfig(const char *key, const char *val, const meta::UserInfo &ui) {
  auto kres = parseKey(key);
  RETURN_ON_ERROR(kres);
  const char *fullKey = key;  // with prefix, for messages
  key += 4;                   // parseKey guarantees a 4 character prefix

  auto [isSys, kidx] = *kres;
  if (isSys) {
    if (!strcmp(key, "storage.net_client.rdma_control.max_concurrent_transmission")) {
      auto n = atoi(val);
      if (n <= 0 || n > 2 * storageMaxConcXmit_) {
        return makeError(
            StatusCode::kInvalidArg,
            fmt::format(
                "invalid value '{}' for key '{}', possible reason is it is larger than twice of system setting {}",
                val,
                fullKey,
                storageMaxConcXmit_.load()));
      }
    }

    RETURN_ON_ERROR(config_->atomicallyUpdate({std::make_pair(key, val)}, true));
    return meta::Inode{configIid(false, true, kidx),
                       {meta::Symlink{val}, meta::Acl{ui.uid, ui.gid, meta::Permission{0400}}}};
  }

  if (!strcmp(key, "readonly") && strcmp(val, "true") && config_->readonly()) {
    // if readonly is turned on cluster-wide, user cannot disable if locally
    return makeError(StatusCode::kInvalidArg, "cannot turn off readonly mode when it is turned on by the sys admin");
  }

  auto uid = ui.uid;

  std::lock_guard lock(userMtx_);
  auto uit = users_.find(uid);
  auto uidx = uid.toUnderType();

  if (uit == users_.end()) {
    if (uidx >= configs_->table.size()) {
      return makeError(MetaCode::kNoPermission, fmt::format("uid {} too large for user config", uid));
    }

    // first override of this user: start from the current global config
    configs_->table[uidx].store(std::make_shared<LocalConfig>(*config_));
    users_.insert(uid);
  }

  auto lconf = configs_->table[uidx].load();

  auto kv = std::make_pair(key, val);
  auto res = lconf->config.atomicallyUpdate({kv}, true);
  RETURN_ON_ERROR(res);
  lconf->updatedItems.emplace_back(std::move(kv));

  return meta::Inode{configIid(false, false, kidx), {meta::Symlink{val}, {uid, ui.gid, meta::Permission{0400}}}};
}
|
||||
|
||||
/// Resolves a prefixed config key to the inode id used for reading it and stats that inode.
Result<meta::Inode> UserConfig::lookupConfig(const char *key, const meta::UserInfo &ui) {
  auto parsed = parseKey(key);
  RETURN_ON_ERROR(parsed);

  const auto [isSys, kidx] = *parsed;
  const auto iid = configIid(true, isSys, kidx);
  return statConfig(iid, ui);
}
|
||||
|
||||
/// Returns the config that applies to `ui`: the user's private config when the user has set an
/// override before, the global config otherwise. The lookup runs under userMtx_; the returned
/// reference is used by the caller after the lock is released.
const FuseConfig &UserConfig::getConfig(const meta::UserInfo &ui) {
  const auto uid = ui.uid;

  std::lock_guard guard(userMtx_);
  if (users_.find(uid) == users_.end()) {
    return *config_;
  }

  auto local = configs_->table[uid.toUnderType()].load();
  return local->config;
}
|
||||
|
||||
/// Builds the symlink inode for the config entry identified by `iid`.
///
/// The entry index is `getConf() - 1 - iid`; indices below systemKeys.size() address system keys,
/// the following ones user keys, anything else is kNotFound. The symlink target is the current
/// value of the item as a string: read from the global config for system keys (mode 0444) and from
/// the caller's effective config for user keys (mode 0400).
///
/// NOTE(review): the selected config is copied by value before the lookup.
Result<meta::Inode> UserConfig::statConfig(meta::InodeId iid, const meta::UserInfo &ui) {
  const auto sysCount = (int64_t)systemKeys.size();
  const auto totalCount = (int64_t)(systemKeys.size() + userKeys.size());

  auto kidx = (int64_t)(meta::InodeId::getConf().u64() - 1 - iid.u64());
  if (kidx < 0 || kidx >= totalCount) {
    return makeError(MetaCode::kNotFound, "iid not a config entry");
  }

  const bool isSys = kidx < sysCount;
  if (!isSys) {
    kidx -= sysCount;  // make the index relative to userKeys
  }

  auto config = isSys ? *config_ : getConfig(ui);
  auto key = isSys ? systemKeys[kidx] : userKeys[kidx];
  return meta::Inode{iid,
                     {meta::Symlink{config.find(key).value()->toString()},
                      meta::Acl{ui.uid, ui.gid, meta::Permission{isSys ? 0444 : 0400}}}};
}
|
||||
|
||||
/// Lists every customizable config item as a directory entry plus inode: first all system keys as
/// "sys.<key>", then all user keys as "usr.<key>". Each inode is the result of lookupConfig() for
/// that name, dereferenced without an error check.
std::pair<std::shared_ptr<std::vector<meta::DirEntry>>, std::shared_ptr<std::vector<std::optional<meta::Inode>>>>
UserConfig::listConfig(const meta::UserInfo &ui) {
  const auto total = systemKeys.size() + userKeys.size();

  std::vector<meta::DirEntry> entries;
  std::vector<std::optional<meta::Inode>> inodes;
  entries.reserve(total);
  inodes.reserve(total);

  // template entry; only the name changes
  meta::DirEntry de{meta::InodeId::getConf(), ""};

  auto appendAll = [&](const char *prefix, const std::vector<std::string> &keys) {
    for (const auto &k : keys) {
      auto name = prefix + k;
      de.name = name;
      entries.emplace_back(de);
      inodes.emplace_back(*lookupConfig(name.data(), ui));
    }
  };
  appendAll("sys.", systemKeys);
  appendAll("usr.", userKeys);

  return std::make_pair(std::make_shared<std::vector<meta::DirEntry>>(std::move(entries)),
                        std::make_shared<std::vector<std::optional<meta::Inode>>>(std::move(inodes)));
}
|
||||
} // namespace hf3fs::fuse
|
||||
64
src/fuse/UserConfig.h
Normal file
64
src/fuse/UserConfig.h
Normal file
@@ -0,0 +1,64 @@
|
||||
#pragma once
|
||||
|
||||
#include "FuseConfig.h"
|
||||
#include "common/utils/AtomicSharedPtrTable.h"
|
||||
#include "fbs/meta/Common.h"
|
||||
#include "fbs/meta/Schema.h"
|
||||
|
||||
namespace hf3fs::fuse {
|
||||
class UserConfig {
|
||||
public:
|
||||
UserConfig() = default;
|
||||
void init(FuseConfig &config);
|
||||
Result<meta::Inode> setConfig(const char *key, const char *val, const meta::UserInfo &ui);
|
||||
Result<meta::Inode> lookupConfig(const char *key, const meta::UserInfo &ui);
|
||||
Result<meta::Inode> statConfig(meta::InodeId iid, const meta::UserInfo &ui);
|
||||
std::pair<std::shared_ptr<std::vector<meta::DirEntry>>, std::shared_ptr<std::vector<std::optional<meta::Inode>>>>
|
||||
listConfig(const meta::UserInfo &ui);
|
||||
|
||||
public:
|
||||
const FuseConfig &getConfig(const meta::UserInfo &ui);
|
||||
|
||||
public:
|
||||
const std::vector<std::string> systemKeys{"storage.net_client.rdma_control.max_concurrent_transmission",
|
||||
"periodic_sync.enable",
|
||||
"periodic_sync.interval",
|
||||
"periodic_sync.flush_write_buf",
|
||||
"io_worker_coros.hi",
|
||||
"io_worker_coros.lo",
|
||||
"max_jobs_per_ioring",
|
||||
"io_job_deq_timeout"};
|
||||
const std::vector<std::string> userKeys{"enable_read_cache",
|
||||
"readonly",
|
||||
"dryrun_bench_mode",
|
||||
"flush_on_stat",
|
||||
"sync_on_stat",
|
||||
"attr_timeout",
|
||||
"entry_timeout",
|
||||
"negative_timeout",
|
||||
"symlink_timeout"};
|
||||
|
||||
private:
|
||||
Result<std::pair<bool, int>> parseKey(const char *key);
|
||||
meta::InodeId configIid(bool isGet, bool isSys, int kidx) {
|
||||
return meta::InodeId{(isGet ? meta::InodeId::getConf() : meta::InodeId::setConf()).u64() - 1 -
|
||||
(isSys ? 0 : systemKeys.size()) - kidx};
|
||||
}
|
||||
|
||||
private:
|
||||
FuseConfig *config_;
|
||||
struct LocalConfig {
|
||||
LocalConfig(const FuseConfig &globalConfig)
|
||||
: config(globalConfig) {}
|
||||
std::mutex mtx;
|
||||
FuseConfig config;
|
||||
std::vector<config::KeyValue> updatedItems;
|
||||
};
|
||||
std::unique_ptr<AtomicSharedPtrTable<LocalConfig>> configs_;
|
||||
std::mutex userMtx_;
|
||||
std::set<meta::Uid> users_;
|
||||
|
||||
private:
|
||||
std::atomic<int> storageMaxConcXmit_;
|
||||
};
|
||||
} // namespace hf3fs::fuse
|
||||
83
src/fuse/hf3fs_fuse.cpp
Normal file
83
src/fuse/hf3fs_fuse.cpp
Normal file
@@ -0,0 +1,83 @@
|
||||
#ifdef ENABLE_FUSE_APPLICATION
|
||||
|
||||
#include "FuseApplication.h"
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
gflags::AllowCommandLineReparsing();
|
||||
using namespace hf3fs;
|
||||
return fuse::FuseApplication().run(argc, argv);
|
||||
}
|
||||
#else
|
||||
#include <folly/ScopeGuard.h>
|
||||
|
||||
#include "FuseConfig.h"
|
||||
#include "FuseMainLoop.h"
|
||||
#include "FuseOps.h"
|
||||
#include "common/logging/LogInit.h"
|
||||
|
||||
using namespace hf3fs;
|
||||
using namespace hf3fs::fuse;
|
||||
|
||||
DECLARE_string(cfg);
|
||||
DECLARE_bool(use_local_cfg);
|
||||
|
||||
auto withRetry(auto &&f, std::string_view desc) {
|
||||
using RetType = decltype(f());
|
||||
auto retryInterval = std::chrono::milliseconds(10);
|
||||
constexpr auto maxRetryInterval = std::chrono::milliseconds(1000);
|
||||
std::optional<RetType> res;
|
||||
for (int i = 0; i < 20; ++i) {
|
||||
res = f();
|
||||
if (*res) break;
|
||||
XLOGF(CRITICAL, "{} failed: {}\nretryCount: {}", desc, res->error(), i);
|
||||
std::this_thread::sleep_for(retryInterval);
|
||||
retryInterval = std::min(2 * retryInterval, maxRetryInterval);
|
||||
}
|
||||
return *res;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
gflags::AllowCommandLineReparsing();
|
||||
FuseConfig hf3fsConfig;
|
||||
hf3fsConfig.init(&argc, &argv);
|
||||
|
||||
auto ibResult = net::IBManager::start(hf3fsConfig.ib_devices());
|
||||
XLOGF_IF(FATAL, !ibResult, "Failed to start IBManager: {}", ibResult.error());
|
||||
SCOPE_EXIT { hf3fs::net::IBManager::stop(); };
|
||||
|
||||
auto logConfigStr = logging::generateLogConfig(hf3fsConfig.log(), String("hf3fs_fuse"));
|
||||
XLOGF(INFO, "LogConfig: {}", logConfigStr);
|
||||
logging::initOrDie(logConfigStr);
|
||||
XLOGF(INFO, "{}", VersionInfo::full());
|
||||
|
||||
auto monitorResult = monitor::Monitor::start(hf3fsConfig.monitor());
|
||||
XLOGF_IF(FATAL, !monitorResult, "Parse config file from flags failed: {}", monitorResult.error());
|
||||
|
||||
auto physicalHostnameRes = SysResource::hostname(/*physicalMachineName=*/true);
|
||||
XLOGF_IF(FATAL, !physicalHostnameRes, "Get physical hostname failed: {}", physicalHostnameRes.error());
|
||||
|
||||
auto containerHostnameRes = SysResource::hostname(/*physicalMachineName=*/false);
|
||||
XLOGF_IF(FATAL, !containerHostnameRes, "Get container hostname failed: {}", containerHostnameRes.error());
|
||||
|
||||
auto clientId = ClientId::random(*physicalHostnameRes);
|
||||
|
||||
flat::AppInfo appInfo;
|
||||
appInfo.clusterId = hf3fsConfig.cluster_id();
|
||||
appInfo.hostname = *physicalHostnameRes;
|
||||
appInfo.pid = SysResource::pid();
|
||||
appInfo.releaseVersion = flat::ReleaseVersion::fromVersionInfo();
|
||||
|
||||
auto &d = getFuseClientsInstance();
|
||||
if (auto res = d.init(appInfo, hf3fsConfig.mountpoint(), hf3fsConfig.token_file(), hf3fsConfig); !res) {
|
||||
XLOGF(FATAL, "Init fuse clients failed: {}", res.error());
|
||||
}
|
||||
SCOPE_EXIT { d.stop(); };
|
||||
|
||||
return fuseMainLoop(argv[0],
|
||||
hf3fsConfig.allow_other(),
|
||||
hf3fsConfig.mountpoint(),
|
||||
hf3fsConfig.io_bufs().max_buf_size(),
|
||||
hf3fsConfig.cluster_id());
|
||||
}
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user