Initial commit

This commit is contained in:
dev
2025-02-27 21:53:53 +08:00
commit 815e55e4c0
1291 changed files with 185445 additions and 0 deletions

2
src/meta/CMakeLists.txt Normal file
View File

@@ -0,0 +1,2 @@
# Library target "meta".
# NOTE(review): target_add_lib is presumably a project-defined helper whose trailing arguments are the
# libraries this target depends on -- confirm against the helper's definition.
target_add_lib(meta core-app core-user core-service fdb meta-fbs mgmtd-client storage-client memory-common analytics)
# Executable target "meta_main" built from meta.cpp.
# NOTE(review): target_add_bin presumably takes (name, source, deps...) -- confirm.
target_add_bin(meta_main "meta.cpp" meta jemalloc)

133
src/meta/base/Config.h Normal file
View File

@@ -0,0 +1,133 @@
#pragma once
#include "analytics/StructuredTraceLog.h"
#include "client/storage/StorageClient.h"
#include "common/kv/TransactionRetry.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/CoroutinesPool.h"
#include "common/utils/Duration.h"
#include "common/utils/PriorityCoroutinePool.h"
#include "common/utils/Size.h"
#include "core/user/UserCache.h"
#include "meta/components/Distributor.h"
#include "meta/components/Forward.h"
#include "meta/components/SessionManager.h"
#include "meta/event/Event.h"
#include "meta/store/Utils.h"
namespace hf3fs::meta::server {
using kv::TransactionRetry;
// Garbage-collection settings of the meta server.
// Scalar options are declared with CONFIG_HOT_UPDATED_ITEM(name, default); nested sections use
// CONFIG_OBJ(name, type[, initializer]) where the initializer lambda overrides defaults of the nested object.
// NOTE(review): the precise meaning of each option is defined by the GC implementation, which is not part of
// this header -- the hints below are derived from the option names only.
struct GcConfig : ConfigBase<GcConfig> {
CONFIG_HOT_UPDATED_ITEM(enable, true);
CONFIG_HOT_UPDATED_ITEM(scan_interval, 200_ms);
CONFIG_HOT_UPDATED_ITEM(scan_batch, 4096);
CONFIG_HOT_UPDATED_ITEM(remove_chunks_batch_size, 32);
// file GC: delay and concurrency (presumably applied to removed files -- confirm)
CONFIG_HOT_UPDATED_ITEM(gc_file_delay, 5_min);
CONFIG_HOT_UPDATED_ITEM(gc_file_concurrent, 32ul);
// directory GC: delay, concurrency and per-directory entry batching
CONFIG_HOT_UPDATED_ITEM(gc_directory_delay, 0_s);
CONFIG_HOT_UPDATED_ITEM(gc_directory_concurrent, 4ul);
CONFIG_HOT_UPDATED_ITEM(gc_directory_entry_batch, 32ul);
CONFIG_HOT_UPDATED_ITEM(gc_directory_entry_concurrent, 4ul);
CONFIG_HOT_UPDATED_ITEM(retry_delay, 10_min);
// disable gc delay if free space is below 5%
CONFIG_HOT_UPDATED_ITEM(gc_delay_free_space_threshold, 5);
CONFIG_HOT_UPDATED_ITEM(check_session, true);
CONFIG_HOT_UPDATED_ITEM(distributed_gc, true); // randomly select a GC directory
CONFIG_HOT_UPDATED_ITEM(txn_low_priority, false);
// chunk-count thresholds used to classify a file as small or large
CONFIG_HOT_UPDATED_ITEM(small_file_chunks, (uint64_t)32);
CONFIG_HOT_UPDATED_ITEM(large_file_chunks, (uint64_t)128);
CONFIG_HOT_UPDATED_ITEM(recursive_perm_check, true);
// worker pool: defaults overridden to 8 coroutines and a queue of 1024
CONFIG_OBJ(workers, PriorityCoroutinePoolConfig, [](auto &c) {
c.set_coroutines_num(8);
c.set_queue_size(1024);
});
// storage-client retry options for chunk removal: 10s initial wait, 10s max wait, 30s max retry time
CONFIG_OBJ(retry_remove_chunks, storage::client::RetryOptions, [](auto &c) {
c.set_init_wait_time(10_s);
c.set_max_wait_time(10_s);
c.set_max_retry_time(30_s);
});
};
// Top-level configuration of the meta server.
// Scalar options are declared with CONFIG_HOT_UPDATED_ITEM(name, default[, checker]); nested sections use
// CONFIG_OBJ(name, type[, initializer]).
// NOTE(review): option semantics are defined by the code that reads them, which is not visible in this header.
struct Config : ConfigBase<Config> {
CONFIG_HOT_UPDATED_ITEM(readonly, false);
CONFIG_HOT_UPDATED_ITEM(authenticate, false);
CONFIG_HOT_UPDATED_ITEM(grv_cache, false);
// nested component configurations
CONFIG_OBJ(gc, GcConfig);
CONFIG_OBJ(session_manager, SessionManager::Config);
CONFIG_OBJ(distributor, Distributor::Config);
CONFIG_OBJ(forward, Forward::Config);
CONFIG_OBJ(event_trace_log, analytics::StructuredTraceLog<MetaEventTrace>::Config);
// path resolution limits; each is validated with ConfigCheckers::checkPositive
CONFIG_HOT_UPDATED_ITEM(max_symlink_depth, 4L, ConfigCheckers::checkPositive);
CONFIG_HOT_UPDATED_ITEM(max_symlink_count, 10L, ConfigCheckers::checkPositive);
CONFIG_HOT_UPDATED_ITEM(max_directory_depth, 64L, ConfigCheckers::checkPositive);
CONFIG_HOT_UPDATED_ITEM(acl_cache_time, 15_s);
CONFIG_HOT_UPDATED_ITEM(list_default_limit, 128);
CONFIG_HOT_UPDATED_ITEM(sync_on_prune_session, false);
CONFIG_HOT_UPDATED_ITEM(max_remove_chunks_per_request, 32u, ConfigCheckers::checkPositive);
CONFIG_HOT_UPDATED_ITEM(allow_stat_deleted_inodes, true);
CONFIG_HOT_UPDATED_ITEM(ignore_length_hint, false);
CONFIG_HOT_UPDATED_ITEM(time_granularity, 1_s);
// dynamic stripe
CONFIG_HOT_UPDATED_ITEM(dynamic_stripe, false);
CONFIG_HOT_UPDATED_ITEM(dynamic_stripe_initial, 16u, ConfigCheckers::checkPositive);
CONFIG_HOT_UPDATED_ITEM(dynamic_stripe_growth, 2u);
// batch operations
CONFIG_HOT_UPDATED_ITEM(batch_stat_concurrent, 8u);
CONFIG_HOT_UPDATED_ITEM(batch_stat_by_path_concurrent, 4u);
CONFIG_HOT_UPDATED_ITEM(max_batch_operations, 4096u);
CONFIG_HOT_UPDATED_ITEM(enable_new_chunk_engine, false);
CONFIG_HOT_UPDATED_ITEM(allow_owner_change_immutable, false);
// deprecated
CONFIG_HOT_UPDATED_ITEM(check_file_hole, false);
// pool defaults overridden to 16 coroutines and a queue of 4096
CONFIG_OBJ(background_hole_checker, CoroutinesPoolBase::Config, [](CoroutinesPoolBase::Config &c) {
c.set_coroutines_num(16);
c.set_queue_size(4096);
});
CONFIG_HOT_UPDATED_ITEM(inodeId_check_unique, true);
CONFIG_HOT_UPDATED_ITEM(inodeId_abort_on_duplicate, false);
// replace file with new inode on O_TRUNC
CONFIG_HOT_UPDATED_ITEM(otrunc_replace_file, true);
CONFIG_HOT_UPDATED_ITEM(otrunc_replace_file_threshold, 1_GB);
// statfs
CONFIG_HOT_UPDATED_ITEM(statfs_cache_time, 60_s);
CONFIG_HOT_UPDATED_ITEM(statfs_update_interval, 5_s);
CONFIG_HOT_UPDATED_ITEM(statfs_space_imbalance_threshold, 5);
// iflags
CONFIG_HOT_UPDATED_ITEM(iflags_chain_allocation, false);
CONFIG_HOT_UPDATED_ITEM(iflags_chunk_engine, true);
// recursive remove
CONFIG_HOT_UPDATED_ITEM(recursive_remove_check_owner, true);
CONFIG_HOT_UPDATED_ITEM(recursive_remove_perm_check, (size_t)1024);
CONFIG_HOT_UPDATED_ITEM(allow_directly_move_to_trash, false);
// idempotent operation
CONFIG_HOT_UPDATED_ITEM(idempotent_record_expire, 30_min);
CONFIG_HOT_UPDATED_ITEM(idempotent_record_clean, 1_min);
CONFIG_HOT_UPDATED_ITEM(idempotent_remove, true);
CONFIG_HOT_UPDATED_ITEM(idempotent_rename, false);
CONFIG_HOT_UPDATED_ITEM(operation_timeout, 5_s);
CONFIG_OBJ(retry_transaction, TransactionRetry);
// storage-client retry options for chunk removal: 10s initial wait, 10s max wait, 30s max retry time
CONFIG_OBJ(retry_remove_chunks, storage::client::RetryOptions, [](auto &c) {
c.set_init_wait_time(10_s);
c.set_max_wait_time(10_s);
c.set_max_retry_time(30_s);
});
CONFIG_OBJ(user_cache, core::UserCache::Config);
};
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,79 @@
#pragma once
#include <array>
#include <folly/Random.h>
#include <folly/Synchronized.h>
#include <folly/container/EvictingCacheMap.h>
#include <optional>
#include "common/monitor/Recorder.h"
#include "common/utils/Duration.h"
#include "common/utils/UtcTime.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Schema.h"
namespace hf3fs::meta::server {
class AclCache {
public:
AclCache(size_t cacheSize) {
for (size_t i = 0; i < kNumShards; i++) {
shardedMaps_.emplace_back(
folly::EvictingCacheMap<InodeId, CacheEntry>(std::max(cacheSize / kNumShards, 1ul << 10), 128));
}
}
std::optional<Acl> get(InodeId inode, Duration ttl) {
static monitor::CountRecorder hit("meta_server.aclcache_hit");
static monitor::CountRecorder miss("meta_server.aclcache_miss");
if (ttl.count() == 0) {
return std::nullopt;
}
std::optional<CacheEntry> cached;
{
auto &shard = getShard(inode);
auto guard = shard.lock();
auto iter = guard->find(inode);
if (iter != guard->end()) {
cached = iter->second;
}
}
if (!cached.has_value()) {
miss.addSample(1);
return std::nullopt;
}
auto deadline = cached->timestamp + ttl * folly::Random::randDouble(0.8, 1.0);
if (deadline < SteadyClock::now()) {
miss.addSample(1);
return std::nullopt;
}
hit.addSample(1);
return cached->acl;
}
void set(InodeId inode, Acl acl) {
auto &shard = getShard(inode);
shard.lock()->set(inode, {SteadyClock::now(), acl});
}
void invalid(InodeId inode) { getShard(inode).lock()->erase(inode); }
private:
static constexpr auto kNumShards = 32u;
struct CacheEntry {
SteadyTime timestamp;
Acl acl;
};
using CacheMap = folly::Synchronized<folly::EvictingCacheMap<InodeId, CacheEntry>, std::mutex>;
CacheMap &getShard(InodeId inode) {
auto shardId = inode.u64() % kNumShards;
return shardedMaps_[shardId];
}
std::vector<CacheMap> shardedMaps_;
};
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,131 @@
#pragma once
#include <cstddef>
#include <cstdint>
#include <fmt/core.h>
#include <folly/Random.h>
#include <folly/Synchronized.h>
#include <folly/logging/xlog.h>
#include <map>
#include <optional>
#include <utility>
#include <vector>
#include "client/mgmtd/ICommonMgmtdClient.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Result.h"
#include "fbs/meta/Schema.h"
#include "fbs/mgmtd/MgmtdTypes.h"
namespace hf3fs::meta::server {
// Validates file layouts against the routing info obtained from the mgmtd client and, for layouts that
// do not yet carry chains, picks a starting chain in round-robin fashion.
class ChainAllocator {
public:
ChainAllocator(std::shared_ptr<client::ICommonMgmtdClient> mgmtdClient)
: mgmtdClient_(std::move(mgmtdClient)) {}
// Checks `layout`: first layout.valid(true), then -- for non-empty layouts -- that every chain index resolves
// to a chain in the current routing info and that the chain has at least one target.
// Fails with MetaCode::kInvalidFileLayout otherwise.
CoTryTask<void> checkLayoutValid(const Layout &layout) {
CO_RETURN_ON_ERROR(layout.valid(true));
if (!layout.empty()) {
auto routing = getRoutingInfo();
const auto &chains = layout.getChainIndexList();
for (auto index : chains) {
auto ref = flat::ChainRef{layout.tableId, layout.tableVersion, index};
if (auto chain = routing->getChain(ref); !chain) {
XLOGF(ERR, "Layout contains a not found ChainRef {}", ref);
co_return makeError(MetaCode::kInvalidFileLayout, fmt::format("{} not found", ref));
} else if (chain->targets.empty()) {
XLOGF(ERR, "Chain {} has no target", chain->chainId);
co_return makeError(MetaCode::kInvalidFileLayout, fmt::format("Chain {} has no target", chain->chainId));
}
}
}
co_return Void{};
}
// Allocates chains using the allocator-wide counter kept per (tableId, stripeSize) in roundRobin_.
CoTryTask<void> allocateChainsForLayout(Layout &layout) {
co_return co_await allocateChainsForLayout(layout, [&](size_t chainCnt) {
auto tableId = layout.tableId;
auto stripeSize = layout.stripeSize;
auto key = AllocType(tableId, stripeSize);
auto guard = roundRobin_.lock();
auto iter = guard->find(key);
if (iter == guard->end()) {
// start with random value
// (rounded down to a multiple of stripeSize)
auto initial = folly::Random::rand32(chainCnt) / stripeSize * stripeSize;
iter = guard->insert({key, initial}).first;
}
// the returned start is 1-based; the counter then advances by stripeSize, wrapping at chainCnt
auto res = (iter->second % chainCnt) + 1;
iter->second = (iter->second + stripeSize) % chainCnt;
return res;
});
}
// Allocates chains using a caller-owned counter. The value (uint32_t)-1 is treated as "not initialized yet"
// and replaced by a random multiple of stripeSize.
CoTryTask<void> allocateChainsForLayout(Layout &layout, folly::Synchronized<uint32_t> &chainAllocCounter) {
co_return co_await allocateChainsForLayout(layout, [&](size_t chainCnt) {
auto guard = chainAllocCounter.wlock();
auto stripeSize = layout.stripeSize;
if (*guard == (uint32_t)-1) {
// start with random value
*guard = folly::Random::rand32(chainCnt) / stripeSize * stripeSize;
}
// add and return.
auto res = (*guard % chainCnt) + 1;
*guard = (*guard + stripeSize) % chainCnt;
return res;
});
}
// Shared implementation. `roundRobin` is called with the number of chains in the table and must return the
// first chain to use. A layout that is already non-empty is only validated and left untouched.
// Fails with MetaCode::kInvalidFileLayout when the chain table is missing, has a zero chainTableVersion,
// is empty, or has fewer chains than layout.stripeSize.
CoTryTask<void> allocateChainsForLayout(Layout &layout, auto &&roundRobin) {
CO_RETURN_ON_ERROR(co_await checkLayoutValid(layout));
if (!layout.empty()) {
co_return Void{};
}
auto tableId = layout.tableId;
auto tableVersion = layout.tableVersion;
auto routing = getRoutingInfo();
const auto *table = routing->raw()->getChainTable(tableId, tableVersion);
if (!table) {
XLOGF(ERR, "Failed to find ChainTable with {} and {}", tableId, tableVersion);
co_return makeError(MetaCode::kInvalidFileLayout,
fmt::format("ChainTable with {} and {} not found", tableId, tableVersion));
} else if (!table->chainTableVersion) {
XLOGF(ERR, "Invalid table {} version {}", tableId, tableVersion);
co_return makeError(MetaCode::kInvalidFileLayout,
fmt::format("Invalid chain table {} version {}", tableId, tableVersion));
}
auto chainCnt = table->chains.size();
if (chainCnt < layout.stripeSize || chainCnt == 0) {
XLOGF(ERR,
"Failed to allocate for layout {}, chain table {} have only {} chains.",
layout,
tableId.toUnderType(),
chainCnt);
co_return makeError(
MetaCode::kInvalidFileLayout,
fmt::format("try to allocate {} chains from {}, found {}", layout.stripeSize, tableId, chainCnt));
}
auto chainBegin = roundRobin(chainCnt);
// pin the layout to the version of the table that was actually found
layout.tableVersion = table->chainTableVersion;
layout.chains = Layout::ChainRange(chainBegin, Layout::ChainRange::STD_SHUFFLE_MT19937, folly::Random::rand64());
if (auto valid = layout.valid(false); valid.hasError()) {
XLOGF(DFATAL, "Layout is not valid after alloc {}, error {}", layout, valid.error());
CO_RETURN_ERROR(valid);
}
co_return Void{};
}
private:
std::shared_ptr<client::RoutingInfo> getRoutingInfo() { return mgmtdClient_->getRoutingInfo(); }
// key of the allocator-wide round-robin counters: (chain table id, stripe size)
using AllocType = std::pair<flat::ChainTableId, size_t>;
folly::Synchronized<std::map<AllocType, uint32_t>, std::mutex> roundRobin_;
std::shared_ptr<client::ICommonMgmtdClient> mgmtdClient_;
};
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,321 @@
#include "Distributor.h"
#include <algorithm>
#include <array>
#include <climits>
#include <cstdint>
#include <cstring>
#include <folly/Random.h>
#include <folly/experimental/coro/BlockingWait.h>
#include <folly/functional/Partial.h>
#include <folly/logging/xlog.h>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <vector>
#include "common/app/NodeId.h"
#include "common/kv/ITransaction.h"
#include "common/kv/KeyPrefix.h"
#include "common/kv/WithTransaction.h"
#include "common/monitor/Recorder.h"
#include "common/serde/Serde.h"
#include "common/utils/BackgroundRunner.h"
#include "common/utils/CPUExecutorGroup.h"
#include "common/utils/Coroutine.h"
#include "common/utils/MurmurHash3.h"
#include "common/utils/Result.h"
#include "common/utils/UtcTime.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Utils.h"
#include "fdb/FDBRetryStrategy.h"
#include "fdb/FDBTransaction.h"
#include "fmt/core.h"
#include "fmt/format.h"
// Renders the bytes of `key` (anything with data()/size()) as comma-separated uint8_t values for logging.
#define FMT_KEY(key) fmt::join((const uint8_t *)(key).data(), (const uint8_t *)(key).data() + (key).size(), ",")
namespace hf3fs::meta::server {
namespace {
// Incremented once per call of Distributor::updateServerMap.
monitor::CountRecorder setMapCounter("meta_server.dist_set_map");
} // namespace
/// Builds the per-server key "<kPrefix>-<node id>", with the id zero-padded to 8 decimal digits.
std::string Distributor::PerServerKey::pack(flat::NodeId nodeId) {
  const auto rawId = nodeId.toUnderType();
  return fmt::format("{}-{:08d}", kPrefix, rawId);
}
/// Parses a key of the form "<kPrefix>-<node id>".
/// Returns NodeId(0) when the key does not match that pattern.
flat::NodeId Distributor::PerServerKey::unpack(std::string_view key) {
  const auto pattern = fmt::format("{}-{{}}", kPrefix);
  uint32_t parsed;
  auto scanned = scn::scan(key, pattern, parsed);
  if (!scanned) {
    return flat::NodeId(0);
  }
  return flat::NodeId(parsed);
}
/// Runs one blocking update, then starts a background task named "distributor_update@<nodeId>" that keeps
/// calling update(false). The delay between runs is update_interval scaled by a random factor in [0.8, 1.2).
/// Failures are logged and otherwise ignored.
void Distributor::start(CPUExecutorGroup &exec) {
  auto initial = folly::coro::blockingWait(update(false));
  if (initial.hasError()) {
    XLOGF(ERR, "failed to update server map on start, error {}", initial.error());
  }

  bgRunner_ = std::make_unique<BackgroundRunner>(&exec.randomPick());
  bgRunner_->start(
      fmt::format("distributor_update@{}", nodeId_),
      [this]() -> CoTask<void> {
        auto outcome = co_await update(false);
        if (outcome.hasError()) {
          XLOGF(CRITICAL, "Distributor update failed, {}", outcome.error());
        }
      },
      [this]() { return config_.update_interval() * folly::Random::randDouble(0.8, 1.2); });
}
void Distributor::stopAndJoin(bool updateMap) {
XLOGF(INFO, "{} stop, update map {}", nodeId_, updateMap);
if (bgRunner_) {
folly::coro::blockingWait(bgRunner_->stopAll());
bgRunner_.reset();
}
if (updateMap) {
XLOGF(INFO, "{} update map on stop", nodeId_);
auto result = folly::coro::blockingWait(update(true));
XLOGF_IF(ERR, result.hasError(), "failed to update server map on stop, error {}", result.error());
}
XLOGF(INFO, "{} stopped", nodeId_);
}
/// Picks the server for `inodeId` from the active list of the locally cached server map.
flat::NodeId Distributor::getServer(InodeId inodeId) {
  return latest_.withRLock([&](const auto &snapshot) { return Weight::select(snapshot.active, inodeId); });
}
/// Convenience overload: checks whether `inodeId` is assigned to this node.
CoTryTask<std::pair<bool, kv::Versionstamp>> Distributor::checkOnServer(kv::IReadWriteTransaction &txn,
                                                                        InodeId inodeId) {
  co_return co_await checkOnServer(txn, inodeId, nodeId());
}
// Checks, inside transaction `txn`, whether `inodeId` maps to `nodeId` under the current server map.
// Returns {server == nodeId, versionstamp read in this transaction}.
// Fails with TransactionCode::kTooOld when the transaction sees an older versionstamp than the cached map.
CoTryTask<std::pair<bool, kv::Versionstamp>> Distributor::checkOnServer(kv::IReadWriteTransaction &txn,
InodeId inodeId,
flat::NodeId nodeId) {
auto versionstamp = co_await loadVersion(txn);
CO_RETURN_ON_ERROR(versionstamp);
auto rlock = latest_.rlock();
if (*versionstamp > rlock->versionstamp) {
// cached map is behind: drop the read lock before awaiting, reload the map, then lock again
rlock.unlock();
CO_RETURN_ON_ERROR(co_await loadServerMap(txn, false));
rlock = latest_.rlock();
}
// after a reload the cache must be at least as new as what this transaction read
XLOGF_IF(FATAL, *versionstamp > rlock->versionstamp, "{} > {}", FMT_KEY(*versionstamp), FMT_KEY(rlock->versionstamp));
if (*versionstamp < rlock->versionstamp) {
XLOGF(WARN, "version {} < {}, need retry", FMT_KEY(*versionstamp), FMT_KEY(rlock->versionstamp));
co_return makeError(TransactionCode::kTooOld, "distributor versionstamp changed");
}
auto server = Weight::select(rlock->active, inodeId);
co_return std::pair(server == nodeId, *versionstamp);
}
/// Reads kv::kMetadataVersionKey and converts it to a Versionstamp.
/// A missing key yields an all-zero versionstamp. A value whose size differs from the versionstamp size is
/// logged at FATAL level.
CoTryTask<kv::Versionstamp> Distributor::loadVersion(kv::IReadOnlyTransaction &txn) {
  co_return (co_await txn.get(kv::kMetadataVersionKey)).then([](auto &value) {
    auto version = kv::Versionstamp{0};
    if (value.has_value()) {
      if (value->size() != version.size()) {
        XLOGF(FATAL, "kMetadataVersionKey -> value {}, size not match", FMT_KEY(*value));
      }
      memcpy(version.data(), value->data(), version.size());
    }
    return version;
  });
}
/// Writes kv::kMetadataVersionKey as a versionstamped value: a zero-filled, versionstamp-sized buffer is
/// passed with offset 0.
CoTryTask<void> Distributor::updateVersion(kv::IReadWriteTransaction &txn) {
  std::array<char, sizeof(kv::Versionstamp)> placeholder{0};
  co_return co_await txn.setVersionstampedValue(kv::kMetadataVersionKey,
                                                {placeholder.data(), placeholder.size()},
                                                0);
}
// Loads the versionstamp and the serialized server map through `txn` and, if they are newer than the cached
// copy in latest_, replaces the cache. `update` is only used in the log message.
// Returns the map and versionstamp read by this transaction (which may be older than the cache).
// Fails with MetaCode::kInconsistent when the stored map cannot be deserialized.
CoTryTask<Distributor::LatestServerMap> Distributor::loadServerMap(kv::IReadOnlyTransaction &txn, bool update) {
auto versionstamp = co_await loadVersion(txn);
CO_RETURN_ON_ERROR(versionstamp);
auto load = co_await txn.get(kMapKey);
CO_RETURN_ON_ERROR(load);
// a missing map is only expected before this node has completed its first update
XLOGF_IF(DFATAL,
(!load->has_value() && updated_ != 0),
"{} distributor server map not found, shouldn't happen",
nodeId_);
ServerMap map;
if (load->has_value()) {
auto des = serde::deserialize(map, **load);
if (des.hasError()) {
XLOGF(DFATAL, "Failed to deserializa server map, {}", des.error());
co_return makeError(MetaCode::kInconsistent, "Invalid distributor server map");
}
} else {
XLOGF(INFO, "server map not found");
}
// cheap check under the read lock; nothing to publish if the cache is already at least as new
if (*versionstamp <= latest_.rlock()->versionstamp) {
co_return LatestServerMap{map, *versionstamp};
}
{
auto wlock = latest_.wlock();
// re-check under the write lock: another caller may have published a newer map in between
if (*versionstamp > wlock->versionstamp) {
XLOGF(INFO,
"{} get new server map: {}, versionstamp: {}, update {}",
nodeId_,
map,
FMT_KEY(*versionstamp),
update);
*wlock = {map, *versionstamp};
}
}
co_return LatestServerMap{map, *versionstamp};
}
/// Stores `map` under kMapKey and bumps the metadata version key in the same transaction.
/// Each call is counted by setMapCounter.
CoTryTask<void> Distributor::updateServerMap(kv::IReadWriteTransaction &txn, const Distributor::ServerMap &map) {
  XLOGF(INFO, "{} try set new server map: {}", nodeId_, map);
  setMapCounter.addSample(1);

  auto mapKey = kMapKey;
  auto serialized = serde::serialize(map);
  CO_RETURN_ON_ERROR(co_await txn.set(mapKey, serialized));
  CO_RETURN_ON_ERROR(co_await updateVersion(txn));
  co_return Void{};
}
/// Runs the transactional update up to 10 times.
/// A round that did not write a new map, or any successful round when `exit` is set, ends the loop with
/// success. A round that wrote a new map is followed by another round so the new map gets loaded.
/// Returns the transaction error if a round fails, or MetaCode::kBusy after 10 rounds that each wrote a map.
CoTryTask<void> Distributor::update(bool exit) {
  constexpr size_t kMaxRounds = 10;
  for (size_t round = 0; round < kMaxRounds; ++round) {
    auto retryStrategy = kv::FDBRetryStrategy();
    auto outcome = co_await kv::WithTransaction<kv::FDBRetryStrategy>(retryStrategy).run(
        kvEngine_->createReadWriteTransaction(),
        [&](kv::IReadWriteTransaction &txn) -> CoTryTask<bool> { co_return co_await update(txn, exit); });
    if (outcome.hasError()) {
      XLOGF(ERR, "{} update failed, error {}", nodeId_, outcome.error());
    }
    CO_RETURN_ON_ERROR(outcome);

    // if update generate a new map, we need update again to load it.
    auto wroteNewMap = *outcome;
    XLOGF(INFO, "{} updated map, new {}, exit {}", nodeId_, wroteNewMap, exit);
    if (exit || !wroteNewMap) {
      co_return Void{};
    }
    updated_.fetch_add(1);
  }

  XLOGF(CRITICAL, "{} update not finished after too many times", nodeId_);
  co_return makeError(MetaCode::kBusy, "update not finished after too many times");
}
// One transactional update round.
//  1. Loads the server map and checks it against the cached copy.
//  2. Lists all per-server keys and records, per node, when its key value was last seen to change.
//  3. Marks active nodes whose key has not changed within config_.timeout() as dead.
//  4. Rewrites this node's own per-server key with a versionstamped value.
//  5. Writes a new server map if this node has to join, dead nodes were found, or this node is exiting.
// Returns true when a new map was written, false when nothing had to change.
// Fails with TransactionCode::kTooOld when the transaction reads an older versionstamp than the cache.
CoTryTask<bool> Distributor::update(kv::IReadWriteTransaction &txn, bool exit) {
XLOGF(INFO, "{} update, exit {}", nodeId_, exit);
auto current = co_await loadServerMap(txn, true);
CO_RETURN_ON_ERROR(current);
auto startCheck = SteadyClock::now();
{
auto rlock = latest_.rlock();
// loadServerMap publishes newer maps to latest_, so the cache can never be behind here
XLOGF_IF(FATAL,
current->versionstamp > rlock->versionstamp,
"{} > {}",
FMT_KEY(current->versionstamp),
FMT_KEY(rlock->versionstamp));
if (current->versionstamp < rlock->versionstamp) {
XLOGF(WARN, "version {} < {}, need retry", FMT_KEY(current->versionstamp), FMT_KEY(rlock->versionstamp));
co_return makeError(TransactionCode::kTooOld, "distributor versionstamp changed");
} else {
XLOGF_IF(DFATAL,
current->active != rlock->active,
"versionstamp {}, {} != {}",
FMT_KEY(current->versionstamp),
fmt::join(current->active.begin(), current->active.end(), ","),
fmt::join(rlock->active.begin(), rlock->active.end(), ","));
}
}
// snapshot read of all "<kPrefix>-..." keys
auto opts = kv::TransactionHelper::ListByPrefixOptions().withSnapshot(true).withInclusive(false).withLimit(0);
auto result = co_await kv::TransactionHelper::listByPrefix(txn, fmt::format("{}-", kPrefix), opts);
CO_RETURN_ON_ERROR(result);
std::set<flat::NodeId> dead;
servers_.withWLock([&](auto &servers) {
bool self = false;
for (auto &[key, versionstamp] : *result) {
auto nodeId = PerServerKey::unpack(key);
if (!nodeId) {
XLOGF(DFATAL, "Failed to unpack key {}", key);
continue;
} else if (nodeId == nodeId_) {
self = true;
continue;
}
// a changed value means the node has written its key since the last observation
if (!servers.contains(nodeId) || servers[nodeId].versionstamp != versionstamp) {
XLOGF(INFO,
"{} found {} alive, prev {}, curr {}",
nodeId_,
nodeId,
FMT_KEY(servers[nodeId].versionstamp),
FMT_KEY(versionstamp));
servers[nodeId] = {versionstamp, SteadyClock::now()};
}
}
XLOGF_IF(DFATAL, (updated_ != 0 && !self), "self {} not found!!!", nodeId_);
auto timeout = config_.timeout();
for (auto nodeId : current->active) {
// operator[] default-constructs a ServerStatus for nodes that were never observed
auto state = servers[nodeId];
if (nodeId != nodeId_ && state.lastUpdate + timeout < startCheck) {
XLOGF(CRITICAL, "{} mark {} as dead, not update in {}", nodeId_, nodeId, timeout);
dead.emplace(nodeId);
}
}
});
// refresh this node's own key
auto key = PerServerKey::pack(nodeId_);
std::array<char, sizeof(kv::Versionstamp)> buf{0};
CO_RETURN_ON_ERROR(co_await txn.setVersionstampedValue(key, {buf.data(), buf.size()}, 0));
bool update = false;
if (!exit && std::find(current->active.begin(), current->active.end(), nodeId_) == current->active.end()) {
XLOGF(INFO, "{} not in server map, create a new map", nodeId_);
update = true;
}
if (!dead.empty()) {
XLOGF(INFO, "{} found dead servers {}, create a new map", nodeId_, fmt::join(dead.begin(), dead.end(), ","));
update = true;
}
if (exit) {
XLOGF(INFO, "{} exiting, create a new map", nodeId_);
dead.insert(nodeId_);
update = true;
}
if (!update) {
co_return false;
}
// new active set = current active set minus dead nodes, plus this node unless it is exiting
std::set<flat::NodeId> active;
if (current) {
for (auto node : current->active) {
if (!dead.contains(node)) {
active.insert(node);
}
}
}
if (!exit) {
active.insert(nodeId_);
}
ServerMap map{std::vector<flat::NodeId>(active.begin(), active.end())};
CO_RETURN_ON_ERROR(co_await updateServerMap(txn, map));
co_return true;
}
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,98 @@
#pragma once
#include <atomic>
#include <folly/Synchronized.h>
#include <folly/logging/xlog.h>
#include <map>
#include <memory>
#include <mutex>
#include <optional>
#include <regex.h>
#include <string>
#include <vector>
#include "common/app/NodeId.h"
#include "common/kv/IKVEngine.h"
#include "common/kv/ITransaction.h"
#include "common/kv/KeyPrefix.h"
#include "common/serde/Serde.h"
#include "common/utils/BackgroundRunner.h"
#include "common/utils/CPUExecutorGroup.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/Coroutine.h"
#include "common/utils/UtcTime.h"
#include "fbs/meta/Common.h"
#include "meta/store/Inode.h"
namespace hf3fs::meta::server {
class Distributor {
public:
struct Config : ConfigBase<Config> {
CONFIG_HOT_UPDATED_ITEM(update_interval, 1_s);
CONFIG_HOT_UPDATED_ITEM(timeout, 30_s);
};
Distributor(const Config &config, flat::NodeId nodeId, std::shared_ptr<kv::IKVEngine> kvEngine)
: config_(config),
nodeId_(nodeId),
kvEngine_(kvEngine) {
XLOGF_IF(FATAL, !nodeId_, "invalid node id {}", nodeId_);
}
~Distributor() { stopAndJoin(); }
flat::NodeId nodeId() const { return nodeId_; }
void start(CPUExecutorGroup &exec);
void stopAndJoin(bool updateMap = true);
flat::NodeId getServer(InodeId inodeId);
CoTryTask<std::pair<bool, kv::Versionstamp>> checkOnServer(kv::IReadWriteTransaction &txn, InodeId inodeId);
CoTryTask<std::pair<bool, kv::Versionstamp>> checkOnServer(kv::IReadWriteTransaction &txn,
InodeId inodeId,
flat::NodeId nodeId);
private:
static constexpr auto kPrefix = kv::toStr(kv::KeyPrefix::MetaDistributor);
static constexpr auto kMapKey = kPrefix;
struct ServerMap {
SERDE_STRUCT_FIELD(active, std::vector<flat::NodeId>());
};
struct LatestServerMap : ServerMap {
kv::Versionstamp versionstamp{0};
};
struct ServerStatus {
std::string versionstamp;
SteadyTime lastUpdate;
};
struct PerServerKey {
static std::string pack(flat::NodeId nodeId);
static flat::NodeId unpack(std::string_view key);
};
CoTryTask<kv::Versionstamp> loadVersion(kv::IReadOnlyTransaction &txn);
CoTryTask<LatestServerMap> loadServerMap(kv::IReadOnlyTransaction &txn, bool update);
CoTryTask<void> updateVersion(kv::IReadWriteTransaction &txn);
CoTryTask<void> updateServerMap(kv::IReadWriteTransaction &txn, const ServerMap &map);
CoTryTask<void> update(bool exit);
CoTryTask<bool> update(kv::IReadWriteTransaction &txn, bool exit);
const Config config_;
flat::NodeId nodeId_;
std::atomic<size_t> updated_{0};
std::shared_ptr<kv::IKVEngine> kvEngine_;
std::unique_ptr<BackgroundRunner> bgRunner_;
folly::Synchronized<LatestServerMap> latest_;
folly::Synchronized<std::map<flat::NodeId, ServerStatus>> servers_;
};
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,246 @@
#include "meta/components/FileHelper.h"
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <fmt/core.h>
#include <folly/Likely.h>
#include <folly/Math.h>
#include <folly/experimental/coro/Collect.h>
#include <folly/futures/Future.h>
#include <folly/logging/xlog.h>
#include <linux/fs.h>
#include <map>
#include <optional>
#include <string>
#include <utility>
#include <vector>
#include "client/storage/StorageClient.h"
#include "common/app/NodeId.h"
#include "common/monitor/Recorder.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Duration.h"
#include "common/utils/Result.h"
#include "common/utils/UtcTime.h"
#include "fbs/meta/FileOperation.h"
#include "fbs/meta/Schema.h"
// Declares `routingInfo` and `rawRoutingInfo` in the calling coroutine. If the mgmtd client has no routing
// info yet, logs an error and co_returns MgmtdClientCode::kRoutingInfoNotReady from the caller.
// Only usable inside coroutines of a class that has a `mgmtdClient_` member.
#define GET_RAW_ROUTING_INFO() \
auto routingInfo = mgmtdClient_->getRoutingInfo(); \
if (!routingInfo || !routingInfo->raw()) { \
XLOGF(ERR, "RoutingInfo not ready"); \
co_return makeError(MgmtdClientCode::kRoutingInfoNotReady); \
} \
auto rawRoutingInfo = routingInfo->raw()
namespace hf3fs::meta::server {
namespace {
// Recorder shared by every FileOperation created in this file.
FileOperation::Recorder recorder("meta_server");
/// Three-way comparison of `a` and `b` using operator< and operator>.
template <typename T>
constexpr folly::ordering compare(const T &a, const T &b) {
  if (a < b) {
    return folly::ordering::lt;
  }
  if (a > b) {
    return folly::ordering::gt;
  }
  return folly::ordering::eq;
}
/// Finds the entries with the smallest and the largest mapped value (`->second`) in `map`.
///
/// @param map Any associative container with forward iterators over key/value pairs.
/// @return    Pair {min, max} of iterators into `map`. On ties the first entry in iteration order
///            wins for both. For an empty container both iterators equal `map.end()`.
template <typename Map>
auto findMinMax(Map &&map) {
  auto min = map.begin();
  auto max = min;
  // Guard the empty case: advancing past end() below would be undefined behaviour.
  if (min == map.end()) {
    return std::make_pair(min, max);
  }
  for (auto iter = std::next(min); iter != map.end(); ++iter) {
    if (min->second > iter->second) {
      min = iter;
    }
    if (max->second < iter->second) {
      max = iter;
    }
  }
  return std::make_pair(min, max);
}
} // namespace
/// Starts the "statFs" background task. It wakes up every 200ms and refreshes the cached filesystem status
/// unless a cached value exists that is younger than both statfs_update_interval and statfs_cache_time.
void FileHelper::start(CPUExecutorGroup &exec) {
  bgRunner_ = std::make_unique<BackgroundRunner>(&exec.pickNext());
  bgRunner_->start(
      "statFs",
      [this]() -> CoTask<void> {
        auto snapshot = *cachedFsStatus_.rlock();
        const bool fresh = snapshot.status_.has_value() &&
                           RelativeTime::now() - snapshot.update_ < config_.statfs_update_interval() &&
                           RelativeTime::now() - snapshot.update_ < config_.statfs_cache_time();
        if (!fresh) {
          co_await updateStatFs();
        }
      },
      []() { return 200_ms; });
}
/// Stops the background runner, waits for it to finish and releases it. Does nothing if it is not running.
void FileHelper::stopAndJoin() {
  if (!bgRunner_) {
    return;
  }
  folly::coro::blockingWait(bgRunner_->stopAll());
  bgRunner_.reset();
}
// Queries the storage service for the chunks of `inode` and returns the file length it reports.
// When `hasHole` is non-null it is set to true if the summed chunk length differs from the file length or
// the number of chunks differs from ceil(length / chunkSize).
// Fails with kRoutingInfoNotReady when routing info is unavailable, or with the error of queryChunks.
CoTryTask<uint64_t> FileHelper::queryLength(const UserInfo &userInfo, const Inode &inode, bool *hasHole) {
GET_RAW_ROUTING_INFO();
FileOperation fop(*storageClient_, *rawRoutingInfo, userInfo, inode, recorder);
auto queryResult = co_await fop.queryChunks(hasHole != nullptr, config_.dynamic_stripe());
CO_RETURN_ON_ERROR(queryResult);
// format string plus arguments shared by the two log statements below; expects `chunkNum` in scope
#define QUERY_DETAIL \
"file {}, length {}, chunk num {}, total chunk length {}, total chunk num {}", inode.id, queryResult->length, \
chunkNum, queryResult->totalChunkLen, queryResult->totalNumChunks
if (hasHole != nullptr) {
auto chunkNum = folly::divCeil(queryResult->length, inode.asFile().layout.chunkSize.u64());
*hasHole = queryResult->length != queryResult->totalChunkLen || chunkNum != queryResult->totalNumChunks;
if (*hasHole) {
XLOGF(ERR, "FileHelper found hole in " QUERY_DETAIL);
} else {
XLOGF(DBG, "FileHelper check hole for " QUERY_DETAIL);
}
}
XLOGF(DBG, "FileHelper query length for {}, {}.", inode.id, queryResult->length);
#undef QUERY_DETAIL
co_return queryResult->length;
}
/// Removes all chunks of `inode`, `removeChunksBatchSize` at a time, until the storage side reports that
/// nothing is left. Returns the total number of removed chunks.
/// Fails with MetaCode::kFoundBug for an inode flagged FS_IMMUTABLE_FL, with kRoutingInfoNotReady when
/// routing info is unavailable, or with the first error returned by removeChunks.
CoTryTask<size_t> FileHelper::remove(const UserInfo &userInfo,
                                     const Inode &inode,
                                     RetryOptions retry,
                                     uint32_t removeChunksBatchSize) {
  if (inode.acl.iflags & FS_IMMUTABLE_FL) {
    auto msg = fmt::format("try remove file {} with FS_IMMUTABLE_FL", inode.id);
    XLOG(DFATAL, msg);
    co_return makeError(MetaCode::kFoundBug, msg);
  }

  GET_RAW_ROUTING_INFO();
  FileOperation fop(*storageClient_, *rawRoutingInfo, userInfo, inode, recorder);

  size_t total = 0;
  for (;;) {
    auto batch = co_await fop.removeChunks(0, removeChunksBatchSize, config_.dynamic_stripe(), retry);
    CO_RETURN_ON_ERROR(batch);
    auto [removed, more] = *batch;
    total += removed;
    if (more) {
      XLOGF(DBG, "File {} has more chunks to remove after removed {} chunks", inode.id, removed);
      continue;
    }
    break;
  }
  co_return total;
}
/// Returns the cached filesystem status if one exists and is not older than statfs_cache_time.
/// Otherwise fails with MetaCode::kFoundBug when the background runner was never started, or with
/// StorageClientCode::kResourceBusy so the caller retries. Neither parameter is used.
CoTryTask<FsStatus> FileHelper::statFs(const UserInfo &userInfo, std::chrono::milliseconds cacheDuration) {
  auto snapshot = *cachedFsStatus_.rlock();
  const bool usable =
      snapshot.status_.has_value() && !(RelativeTime::now() - snapshot.update_ > config_.statfs_cache_time());
  if (usable) {
    co_return *snapshot.status_;
  }

  if (UNLIKELY(!bgRunner_)) {
    XLOGF(DFATAL, "FileHelper not started");
    co_return makeError(MetaCode::kFoundBug, "FileHelper not started!");
  }
  co_return makeError(StorageClientCode::kResourceBusy, "cached statfs outdate, try again");
}
/// Queries every active storage node for its space info, aggregates capacity and free space, logs the
/// node-level and path-level imbalance, and stores the result in cachedFsStatus_.
///
/// Nodes whose query fails are logged and skipped. Free space of a path is clamped to its capacity
/// before being added to the total.
///
/// @return Void on success; MgmtdClientCode::kRoutingInfoNotReady when routing info is not available
///         yet, in which case the cache is left untouched.
CoTryTask<void> FileHelper::updateStatFs() {
  static constexpr double kTiB = 1ULL << 40;

  // Same readiness check as the other entry points of this file. Without it, a background run that
  // happens before routing info is available would dereference a null pointer.
  auto routingInfo = mgmtdClient_->getRoutingInfo();
  if (!routingInfo || !routingInfo->raw()) {
    XLOGF(ERR, "RoutingInfo not ready");
    co_return makeError(MgmtdClientCode::kRoutingInfoNotReady);
  }

  std::vector<folly::SemiFuture<Result<storage::SpaceInfoRsp>>> reqs;
  auto nodes =
      routingInfo->getNodeBy(flat::selectNodeByType(flat::NodeType::STORAGE) && flat::selectActiveNode());
  for (auto &node : nodes) {
    auto req = storageClient_->querySpaceInfo(storage::NodeId(node.app.nodeId)).semi();
    reqs.push_back(std::move(req));
  }

  uint64_t cap = 0;
  uint64_t free = 0;
  auto results = co_await folly::coro::collectAllRange(std::move(reqs));
  std::map<flat::NodeId, size_t> nodesFree;
  std::map<std::pair<flat::NodeId, std::string>, size_t> pathFree;
  for (size_t i = 0; i < nodes.size(); i++) {
    const auto &node = nodes.at(i);
    const auto &result = results.at(i);
    if (result.hasError()) {
      XLOGF(ERR, "FileHelper statFs: failed to querySpaceInfo of {}, error {}", node, result.error());
      continue;
    }
    const auto &rsp = *result;
    for (const auto &space : rsp.spaceInfos) {
      XLOGF(DBG,
            "FileHelper statFs: node {}, path {}, cap {:.1f}TiB, free {:.1f}TiB",
            node.app.nodeId,
            space.path,
            space.capacity / kTiB,
            space.free / kTiB);
      cap += space.capacity;
      // clamp so that the `cap - free` computed at the end cannot underflow
      free += std::min(space.free, space.capacity);
      nodesFree[node.app.nodeId] += space.free;
      pathFree[std::make_pair(node.app.nodeId, space.path)] = space.free;
    }
  }

  // log space info
  // (pathFree non-empty implies nodesFree non-empty, as both are filled in the same loop iteration)
  if (!pathFree.empty()) {
    auto status = cachedFsStatus_.rlock()->status_;
    if (status) {
      XLOGF(INFO,
            "FileHelper statFs: cap {:.1f}TiB, free {:.1f}TiB, prev free {:.1f}TiB, free diff {:.1f}TiB",
            cap / kTiB,
            free / kTiB,
            status->free / kTiB,
            ((int64_t)free - (int64_t)status->free) / kTiB);
    } else {
      XLOGF(INFO, "FileHelper statFs: cap {:.1f}TiB, free {:.1f}TiB", cap / kTiB, free / kTiB);
    }

    auto threshold = config_.statfs_space_imbalance_threshold();

    // imbalance between nodes: (max free - min free) as a percentage of the average node capacity
    auto [minNode, maxNode] = findMinMax(nodesFree);
    auto avgNodeCap = cap / nodesFree.size();
    auto nodeDiff = (double)(maxNode->second - minNode->second) / avgNodeCap * 100.0;
    auto nodeMsg = fmt::format("{} {:.1f}TiB free, {} {:.1f}TiB free, avgNodeCap {:.1f}TiB, diff {:.3f}%",
                               minNode->first,
                               minNode->second / kTiB,
                               maxNode->first,
                               maxNode->second / kTiB,
                               avgNodeCap / kTiB,
                               nodeDiff);
    if (nodeDiff > threshold) {
      XLOGF(WARN, "FileHelper statFs: node space utilization imbalance, {}", nodeMsg);
    } else {
      XLOGF(INFO, "FileHelper statFs: {}", nodeMsg);
    }

    // imbalance between individual (node, path) entries, computed the same way
    auto [minPath, maxPath] = findMinMax(pathFree);
    auto avgPathCap = cap / pathFree.size();
    auto pathDiff = (double)(maxPath->second - minPath->second) / avgPathCap * 100.0;
    auto pathMsg = fmt::format("{}:{} {:.1f}TiB free, {}:{} {:.1f}TiB free, avgPathCap {:.1f}TiB, diff {:.3f}%",
                               minPath->first.first,
                               minPath->first.second,
                               minPath->second / kTiB,
                               maxPath->first.first,
                               maxPath->first.second,
                               maxPath->second / kTiB,
                               avgPathCap / kTiB,
                               pathDiff);
    if (pathDiff > threshold) {
      XLOGF(WARN, "FileHelper statFs: disk space utilization imbalance, {}", pathMsg);
    } else {
      XLOGF(INFO, "FileHelper statFs: {}", pathMsg);
    }
  }

  auto guard = cachedFsStatus_.wlock();
  guard->update_ = RelativeTime::now();
  guard->status_ = FsStatus(cap, cap - free, free);
  co_return Void{};
}
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,73 @@
#pragma once
#include <chrono>
#include <fmt/core.h>
#include <folly/SharedMutex.h>
#include <folly/Synchronized.h>
#include <folly/experimental/coro/BlockingWait.h>
#include <folly/experimental/coro/Mutex.h>
#include <folly/experimental/coro/SharedMutex.h>
#include <memory>
#include <optional>
#include <utility>
#include "client/mgmtd/ICommonMgmtdClient.h"
#include "client/storage/StorageClient.h"
#include "common/utils/BackgroundRunner.h"
#include "common/utils/CPUExecutorGroup.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Duration.h"
#include "common/utils/UtcTime.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Service.h"
#include "meta/base/Config.h"
#include "meta/store/Inode.h"
namespace hf3fs::meta::server {
// Filesystem-wide space statistics; this is simply the StatFsRsp response type.
using FsStatus = StatFsRsp;
// Helper for file-level operations of the meta server that need the mgmtd / storage clients:
// querying a file's length, removing a file's chunks, and reporting (cached) filesystem space usage.
class FileHelper {
public:
using RetryOptions = storage::client::RetryOptions;
// Keeps a reference to `config` (the caller must keep it alive for the lifetime of this object)
// and takes shared ownership of both clients.
FileHelper(const Config &config,
std::shared_ptr<client::ICommonMgmtdClient> mgmtdClient,
std::shared_ptr<storage::client::StorageClient> storageClient)
: config_(config),
mgmtdClient_(std::move(mgmtdClient)),
storageClient_(std::move(storageClient)) {}
// Stops background work before the members are destroyed.
~FileHelper() { stopAndJoin(); }
// Starts background work on `exec`.
// NOTE(review): presumably this launches the periodic statFs refresh via bgRunner_ — confirm in FileHelper.cc.
void start(CPUExecutorGroup &exec);
// Stops background work and waits for it; also invoked by the destructor.
void stopAndJoin();
// Queries the length of `inode`; `hasHole` is an optional out-parameter (may be nullptr).
CoTryTask<uint64_t> queryLength(const UserInfo &userInfo, const Inode &inode, bool *hasHole = nullptr);
// Removes the chunks of `inode`. The GC caller treats the returned value as the number of removed chunks.
CoTryTask<size_t> remove(const UserInfo &userInfo,
const Inode &inode,
RetryOptions retry,
uint32_t removeChunksBatchSize);
// Returns filesystem space statistics.
// NOTE(review): `cacheDuration` presumably bounds how stale a cached answer may be — confirm in FileHelper.cc.
CoTryTask<FsStatus> statFs(const UserInfo &userInfo, std::chrono::milliseconds cacheDuration);
// Returns a copy of the last cached status (taken under a read lock); std::nullopt if nothing is cached yet.
std::optional<FsStatus> cachedFsStatus() const { return cachedFsStatus_.rlock()->status_; }
private:
// Recomputes the space statistics and stores them into cachedFsStatus_.
CoTryTask<void> updateStatFs();
const Config &config_;
std::shared_ptr<client::ICommonMgmtdClient> mgmtdClient_;
std::shared_ptr<storage::client::StorageClient> storageClient_;
// Cached statFs result together with the time it was produced.
struct CachedFsStatus {
// Time of the last refresh (updateStatFs sets it to RelativeTime::now()).
RelativeTime update_;
// Last computed status; empty until the first successful refresh.
std::optional<FsStatus> status_;
};
std::unique_ptr<BackgroundRunner> bgRunner_;
folly::Synchronized<CachedFsStatus> cachedFsStatus_;
};
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,172 @@
#pragma once
#include <folly/logging/xlog.h>
#include <functional>
#include <memory>
#include <type_traits>
#include <variant>
#include "client/mgmtd/ICommonMgmtdClient.h"
#include "client/mgmtd/MgmtdClientForServer.h"
#include "common/app/NodeId.h"
#include "common/net/Client.h"
#include "common/net/RequestOptions.h"
#include "common/serde/CallContext.h"
#include "common/serde/ClientContext.h"
#include "common/serde/ClientMockContext.h"
#include "common/serde/MessagePacket.h"
#include "common/utils/Address.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Result.h"
#include "common/utils/StatusCode.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Service.h"
#include "fbs/meta/Utils.h"
#include "fmt/core.h"
#include "meta/store/Inode.h"
namespace hf3fs::meta::server {
// Forwards a meta request from this meta server to another meta server node.
// The transport is either a real net::Client or, for tests, a per-node map of mock client contexts.
class Forward {
public:
struct Config : ConfigBase<Config> {
// When true, forwarding activity is logged at INFO level (otherwise the forward log in check() uses DBG).
CONFIG_HOT_UPDATED_ITEM(debug, true);
// Address type used to select the target node's "MetaSerde" address.
CONFIG_HOT_UPDATED_ITEM(addr_type, net::Address::Type::RDMA);
// Timeout copied into the request options of every forwarded RPC.
CONFIG_HOT_UPDATED_ITEM(timeout, 10_s);
};
using NetClient = std::reference_wrapper<net::Client>;
using MockClient = std::reference_wrapper<std::map<flat::NodeId, serde::ClientMockContext>>;
// `config` is kept by reference and must outlive this object.
// Aborts (FATAL) when `nodeId` is invalid, since it is stamped onto every forwarded request.
Forward(const Config &config,
flat::NodeId nodeId,
std::variant<NetClient, MockClient> client,
std::shared_ptr<::hf3fs::client::ICommonMgmtdClient> mgmtdClient)
: config_(config),
nodeId_(nodeId),
client_(client),
mgmtdClient_(mgmtdClient) {
XLOGF_IF(FATAL, !nodeId_, "invalid nodeId {}", nodeId_);
}
// Forwards `req` to `node` and returns the remote response (or an error).
// The call is wrapped in an OperationRecorder guard labelled "forward" for the requesting uid.
template <typename Req, typename Rsp>
CoTryTask<Rsp> forward(flat::NodeId node, Req req) {
OperationRecorder::Guard record(OperationRecorder::server(), "forward", req.user.uid);
auto result = co_await forwardImpl<Req, Rsp>(node, std::move(req));
record.finish(result);
co_return result;
}
private:
// Maps a (Req, Rsp) pair to the MetaSerde RPC method used to send it.
// The primary template has no `rpcMethod`, so only the pairs specialized below can be forwarded.
template <typename Req, typename Rsp, typename Context>
struct ForwardMethod {};
template <typename Context>
struct ForwardMethod<SyncReq, SyncRsp, Context> {
static constexpr auto rpcMethod = MetaSerde<>::sync<Context>;
};
template <typename Context>
struct ForwardMethod<CloseReq, CloseRsp, Context> {
static constexpr auto rpcMethod = MetaSerde<>::close<Context>;
};
template <typename Context>
struct ForwardMethod<SetAttrReq, SetAttrRsp, Context> {
static constexpr auto rpcMethod = MetaSerde<>::setAttr<Context>;
};
template <typename Context>
struct ForwardMethod<CreateReq, CreateRsp, Context> {
static constexpr auto rpcMethod = MetaSerde<>::create<Context>;
};
// Validates that `req` may be forwarded to `node` and marks it as forwarded by this node.
// Fails with kForwardFailed when the target is unknown or when the request was already forwarded once.
template <typename Req>
Result<Void> check(flat::NodeId node, Req &req) {
if (!node) {
XLOGF(WARN, "request {}, unknown corresponding server, need retry", req);
return makeError(MetaCode::kForwardFailed, "unknown corresponding server");
}
// A request that already carries a forwarder id must not be forwarded a second time.
if (req.forward) {
XLOGF_IF(INFO, config_.debug(), "request is forward from {}, can't forward again, req {}.", req.forward, req);
return makeError(MetaCode::kForwardFailed, "double forward, retry");
}
// Stamp this node as the forwarder so the receiver can detect a double forward.
req.forward = nodeId_;
// Exactly one of the following two log statements fires, depending on the debug switch.
XLOGF_IF(INFO, config_.debug(), "forward req {} to {}", req, node);
XLOGF_IF(DBG, !config_.debug(), "forward req {} to {}", req, node);
XLOGF_IF(FATAL, nodeId_ == node, "forward to self, {} == {}", nodeId_, node);
return Void{};
}
// Resolves the network address of `node` from the current routing info.
// Returns kForwardFailed when routing info is missing, the node is unknown, or it has no address
// of the configured type; otherwise returns the first matching address.
CoTryTask<net::Address> getAddress(flat::NodeId node) {
auto routing = mgmtdClient_->getRoutingInfo();
if (!routing) {
co_return makeError(MetaCode::kForwardFailed, "routing info not ready, need retry");
}
auto *nodeInfo = routing->raw()->getNode(node);
if (!nodeInfo) {
auto msg = fmt::format("req forward: routing info doesn't contains node {}", node);
XLOG(WARN, msg);
co_return makeError(MetaCode::kForwardFailed, std::move(msg));
}
auto addrs = nodeInfo->extractAddresses("MetaSerde", config_.addr_type());
if (addrs.empty()) {
auto msg =
fmt::format("req forward: node {} doesn't have {} addr.", node, magic_enum::enum_name(config_.addr_type()));
XLOG(WARN, msg);
co_return makeError(MetaCode::kForwardFailed, std::move(msg));
}
co_return addrs.front();
}
// Performs the actual forward: validates the request, builds the request options, sends the RPC
// through the real or mock client, and translates RPC-level failures into kForwardTimeout.
template <typename Req, typename Rsp>
CoTryTask<Rsp> forwardImpl(flat::NodeId node, Req req) {
CO_RETURN_ON_ERROR(check(node, req));
auto opts = net::UserRequestOptions();
opts.timeout = config_.timeout();
opts.sendRetryTimes = 3;
opts.compression = std::nullopt;
// Placeholder value; both branches below overwrite it or return early.
Result<Rsp> result = makeError(MetaCode::kFoundBug);
if (std::holds_alternative<NetClient>(client_)) {
auto &client = std::get<NetClient>(client_);
auto addr = co_await getAddress(node);
CO_RETURN_ON_ERROR(addr);
auto ctx = client.get().serdeCtx(*addr);
result = co_await ForwardMethod<Req, Rsp, serde::ClientContext>::rpcMethod(ctx, req, &opts, nullptr);
} else {
// Mock transport: look the target up in the per-node map of mock contexts.
auto &client = std::get<MockClient>(client_);
if (!client.get().contains(node)) {
co_return makeError(MetaCode::kForwardFailed, fmt::format("{} not found", node));
}
auto &ctx = client.get()[node];
result = co_await ForwardMethod<Req, Rsp, serde::ClientMockContext>::rpcMethod(ctx, req, &opts, nullptr);
}
// Any error whose status code type is RPC is reported to the caller as kForwardTimeout.
if (result.hasError() && StatusCode::typeOf(result.error().code()) == StatusCodeType::RPC) {
XLOGF(ERR, "failed to forward req to {}, error {}", node, result.error());
co_return makeError(MetaCode::kForwardTimeout,
fmt::format("failed to forward req to {}, error {}", node, result.error()));
}
// Other results (success or non-RPC error) are passed through unchanged.
if (result.hasError()) {
XLOGF_IF(INFO, config_.debug(), "forward req {} to {}, rsp {}", req, node, result.error());
} else {
XLOGF_IF(INFO, config_.debug(), "forward req {} to {}, rsp {}", req, node, result.value());
}
co_return result;
}
const Config &config_;
// Id of this node; written into `req.forward` of every forwarded request.
flat::NodeId nodeId_;
std::variant<NetClient, MockClient> client_;
std::shared_ptr<::hf3fs::client::ICommonMgmtdClient> mgmtdClient_;
};
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,903 @@
#include "meta/components/GcManager.h"
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <fmt/core.h>
#include <folly/Likely.h>
#include <folly/Random.h>
#include <folly/ScopeGuard.h>
#include <folly/String.h>
#include <folly/experimental/coro/BlockingWait.h>
#include <folly/experimental/coro/Sleep.h>
#include <folly/experimental/coro/ViaIfAsync.h>
#include <folly/functional/Partial.h>
#include <folly/logging/xlog.h>
#include <limits>
#include <linux/fs.h>
#include <linux/limits.h>
#include <map>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "common/app/NodeId.h"
#include "common/kv/ITransaction.h"
#include "common/kv/WithTransaction.h"
#include "common/monitor/Recorder.h"
#include "common/utils/BackgroundRunner.h"
#include "common/utils/CPUExecutorGroup.h"
#include "common/utils/Coroutine.h"
#include "common/utils/CoroutinesPool.h"
#include "common/utils/Duration.h"
#include "common/utils/MagicEnum.hpp"
#include "common/utils/Path.h"
#include "common/utils/Result.h"
#include "common/utils/SemaphoreGuard.h"
#include "common/utils/StatusCode.h"
#include "common/utils/SysResource.h"
#include "common/utils/UtcTime.h"
#include "fbs/core/user/User.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Schema.h"
#include "fdb/FDBTransaction.h"
#include "fmt/format.h"
#include "foundationdb/fdb_c_types.h"
#include "meta/components/FileHelper.h"
#include "meta/event/Event.h"
#include "meta/store/DirEntry.h"
#include "meta/store/FileSession.h"
#include "meta/store/Inode.h"
#include "meta/store/Utils.h"
#include "meta/store/ops/SetAttr.h"
namespace hf3fs::meta::server {
// Number of GC directory slots per meta server. GcManager::init() probes indices
// [0, kNumGcDirectoryPerServer): index 0 is only opened if it already exists, the others are
// created on demand.
static constexpr size_t kNumGcDirectoryPerServer = 5; // 4 + 1
namespace {
// Monitoring recorders for the GC subsystem.
// NOTE(review): gcSuccCount, gcFailCount and gcLatency are not used in the part of this file shown
// here — presumably they are sampled where GC tasks are executed; confirm.
monitor::CountRecorder gcSuccCount("meta_server.gc_success");
monitor::CountRecorder gcFailCount("meta_server.gc_fail");
// Sampled when a GC task fails with an error classified as critical (see GcTask::run).
monitor::CountRecorder gcCritical("meta_server.gc_critical");
// Sampled once per inode handed to GcDirectory::add.
monitor::CountRecorder gcEnqueue("meta_server.gc_enqueue");
// Sampled when GC of a file is postponed because a file session still exists.
monitor::CountRecorder gcBusy("meta_server.gc_busy");
monitor::LatencyRecorder gcLatency("meta_server.gc_latency");
// Distribution of the number of chunks removed per garbage-collected file.
monitor::DistributionRecorder chunksDist("meta_server.gc_chunks");
} // namespace
/* GcManager::GcDirectory */
// Launches one cancellable background scanner coroutine per GC entry type for this GC directory.
// Every scanner is registered with latch_ before it starts so stopAndJoin() can wait for all of them.
void GcManager::GcDirectory::start(GcManager &manager, CPUExecutorGroup &exec) {
  const std::vector<GcEntryType> scanTypes{GcEntryType::DIRECTORY,
                                           GcEntryType::FILE_LARGE,
                                           GcEntryType::FILE_MEDIUM,
                                           GcEntryType::FILE_SMALL};
  // Every entry type must have its own scanner.
  assert(scanTypes.size() == GcEntryType::MAX);
  for (const auto scanType : scanTypes) {
    latch_.increase();
    auto scanner = co_withCancellation(cancel_.getToken(), scan(manager, scanType));
    std::move(scanner).scheduleOn(&exec.pickNext()).start();
  }
}
// Requests cancellation of all scanner coroutines and blocks until each of them has counted
// down the latch (i.e. has exited).
void GcManager::GcDirectory::stopAndJoin() {
  cancel_.requestCancellation();
  auto allScannersExited = latch_.wait();
  folly::coro::blockingWait(std::move(allScannersExited));
}
// Background loop of one scanner: periodically scans this GC directory for entries of `type`
// and enqueues them as GC tasks, until the sleep between rounds is cancelled.
CoTask<void> GcManager::GcDirectory::scan(GcManager &manager, GcEntryType type) {
  // Always release the latch on exit so stopAndJoin() can return.
  SCOPE_EXIT { latch_.countDown(); };

  // Last key seen by the previous pass; lets the next pass skip already visited keys.
  std::optional<std::string> resumeKey;
  auto lastReset = SteadyClock::now();
  for (;;) {
    auto sleepFor = manager.config_.gc().scan_interval();
    if (manager.config_.gc().enable()) {
      // Forget the resume position every 5 seconds so the scan restarts from the beginning.
      auto now = SteadyClock::now();
      if (now - lastReset > 5_s) {
        resumeKey = std::nullopt;
        lastReset = now;
      }

      const bool isDirectory = type == GcEntryType::DIRECTORY;
      auto delay = isDirectory ? manager.config_.gc().gc_directory_delay() : manager.config_.gc().gc_file_delay();
      auto effectiveDelay = manager.enableGcDelay() ? delay : 0_ms;
      auto batch = manager.config_.gc().scan_batch();
      auto result = co_await scan(manager, type, effectiveDelay, batch, resumeKey);
      if (!result) {
        XLOGF(ERR, "GcDirectory {} scan {} failed, error {}", name(), magic_enum::enum_name(type), result.error());
      } else if (!*result) {
        // Nothing found in this pass: wait at least 100ms before the next one.
        sleepFor = std::max(sleepFor, 100_ms);
      }
    }

    auto slept = co_await folly::coro::co_awaitTry(folly::coro::sleep(sleepFor.asUs()));
    if (LIKELY(!slept.hasException())) {
      continue;
    }
    // Cancellation is the only exception expected from the sleep; anything else is fatal.
    XLOGF_IF(FATAL, !slept.hasException<OperationCancelled>(), "Exception {}", slept.exception().what());
    XLOGF(INFO, "GcDirectory::scan {} {} exit", entry_.name, magic_enum::enum_name(type));
    co_return;
  }
}
// Performs one scan pass over the GC entries of `type` in this GC directory and enqueues a GcTask
// for every entry that is not already queued.
//
// manager: owner providing the KV engine and the GC worker pool.
// type:    entry type to scan; selects the key prefix and the per-type queue state.
// delay:   the upper key bound is built from (now - delay).
//          NOTE(review): presumably key order follows the embedded timestamp, so younger entries
//          are not visited yet — confirm against formatGcEntry.
// limit:   maximum number of tasks that may be outstanding for this type.
// prev:    in/out resume position; updated to the name of the last entry seen.
//
// Returns true if the queue is already full or at least one entry was seen, false if the scanned
// range was empty; returns an error if loading entries fails.
CoTryTask<bool> GcManager::GcDirectory::scan(GcManager &manager,
GcEntryType type,
Duration delay,
size_t limit,
std::optional<std::string> &prev) {
auto &state = states_[type];
XLOGF(DBG, "GcDirectory {} scan {}, queued {}", name(), magic_enum::enum_name(type), state.counter.load());
auto prefix = prefixOf(type);
std::string prefixstr = fmt::format("{}", prefix);
std::string beginkey = prefixstr;
auto endtime = UtcTime::fromMicroseconds(UtcClock::now().toMicroseconds() - delay.asUs().count());
std::string endkey = formatGcEntry(prefix, endtime, InodeId::root());
XLOGF_IF(FATAL, beginkey >= endkey, "{} >= {}", beginkey, endkey);
// skip some keys
if (prev && *prev > beginkey) {
beginkey = std::min(*prev, endkey);
}
// NOTE(review): this guard stays alive for the rest of the function, i.e. across the co_await
// below — confirm that the lock type of `queued` tolerates being held over a suspension point.
auto queued = state.queued.lock();
auto empty = true;
while (beginkey < endkey) {
auto cnt = state.counter.load();
if (cnt >= limit) {
XLOGF(DBG, "GcDirectory skip scan, queuedCnt {}", cnt);
co_return true;
}
// Drain the ids reported by finish() and drop them from the queued set so they can be
// picked up again if their GC entry still exists.
auto finished = state.finished.withLock([](auto &v) { return std::exchange(v, {}); });
for (auto &inode : finished) {
queued->erase(inode);
}
auto txn = manager.kvEngine_->createReadonlyTransaction();
// Load at most as many entries as there is room left in the queue.
auto result = co_await DirEntryList::snapshotLoad(*txn, dirId(), beginkey, endkey, limit - cnt);
CO_RETURN_ON_ERROR(result);
for (auto &entry : result->entries) {
if (!queued->contains(entry.id)) {
queued->insert(entry.id);
state.counter++;
auto task = GcTask(shared_from_this(), type, entry);
manager.gcWorkers_->enqueue(std::move(task), priorityOf(type));
}
}
if (!result->entries.empty()) {
// Continue (and later resume) from the last entry that was returned.
beginkey = result->entries.back().name;
prev = result->entries.back().name;
empty = false;
}
if (!result->more) {
break;
}
}
co_return !empty;
}
// Marks `task` as finished: records its inode id in the per-type `finished` set (drained by the
// next scan pass) and decrements the per-type counter of outstanding tasks.
//
// Aborts (FATAL) on an invalid task type or when the counter would drop below zero.
void GcManager::GcDirectory::finish(const GcManager::GcTask &task) {
  XLOGF_IF(FATAL, task.type >= GcEntryType::MAX, "Invalid GcEntryType {}", static_cast<int>(task.type));
  auto &state = states_[task.type];
  state.finished.lock()->insert(task.taskEntry.id);
  // Post-decrement yields the value *before* the decrement. Every finished task was counted when
  // it was enqueued, so the previous value must be at least 1; a previous value of 0 (or less)
  // means this very call underflowed the counter.
  auto cnt = state.counter--;
  XLOGF_IF(FATAL, cnt <= 0, "cnt {}", cnt);
}
// Creates a GC entry for `inode` in this GC directory within `txn`.
// Files and directories are dispatched to their dedicated helpers; any other inode type is fatal.
CoTryTask<void> GcManager::GcDirectory::add(auto &txn, const Inode &inode, const GcConfig &config, GcInfo gcInfo) {
  gcEnqueue.addSample(1);
  const auto inodeType = inode.getType();
  if (inodeType == InodeType::File) {
    co_return co_await addFile(txn, inode, config);
  }
  if (inodeType == InodeType::Directory) {
    // only directory need gcInfo
    co_return co_await addDirectory(txn, inode, gcInfo);
  }
  XLOGF(FATAL, "Invalid inode type {}, inode {}", magic_enum::enum_name(inodeType), inode);
}
// Stores a GC entry for the file `inode`. The entry is classified as small / medium / large by
// its chunk count (length / chunkSize) so that each class is scanned by its own scanner.
CoTryTask<void> GcManager::GcDirectory::addFile(auto &txn, const Inode &inode, const GcConfig &config) {
  const auto &file = inode.asFile();
  auto chunks = file.length / file.layout.chunkSize;

  // Medium unless one of the thresholds applies; the "small" check is evaluated last and wins
  // if both thresholds match.
  auto entryType = GcEntryType::FILE_MEDIUM;
  if (chunks >= config.large_file_chunks()) {
    entryType = GcEntryType::FILE_LARGE;
  }
  if (chunks < config.small_file_chunks()) {
    entryType = GcEntryType::FILE_SMALL;
  }

  auto gcName = formatGcEntry(prefixOf(entryType), UtcClock::now(), inode.id);
  auto entry = DirEntry::newFile(dirId(), std::move(gcName), inode.id);
  CO_RETURN_ON_ERROR(co_await entry.store(txn));
  XLOGF(DBG, "GcManager create GC entry {}", entry);
  co_return Void{};
}
// Stores a GC entry for the directory `inode`, carrying `gcInfo` along with it.
CoTryTask<void> GcManager::GcDirectory::addDirectory(auto &txn, const Inode &inode, GcInfo gcInfo) {
  auto gcName = formatGcEntry(prefixOf(GcEntryType::DIRECTORY), UtcClock::now(), inode.id);
  auto entry = DirEntry::newDirectory(dirId(), std::move(gcName), inode.id, inode.acl);
  entry.gcInfo = gcInfo;
  CO_RETURN_ON_ERROR(co_await entry.store(txn));
  XLOGF(DBG, "GcManager create GC entry {}", entry);
  co_return Void{};
}
// Re-inserts the GC entry of `task` under a new name built from (now + delay) and removes the old
// entry. Does nothing if the entry no longer exists.
CoTryTask<void> GcManager::GcDirectory::moveToTail(auto &txn, const GcTask &task, Duration delay) {
  const auto &oldEntry = task.taskEntry;

  // check task entry still exists
  auto stillExists = co_await DirEntry::checkExist(txn, oldEntry.parent, oldEntry.name);
  CO_RETURN_ON_ERROR(stillExists);
  if (!*stillExists) {
    co_return Void{};
  }

  auto movedEntry = oldEntry;
  movedEntry.name = formatGcEntry(prefixOf(task.type), UtcClock::now() + delay.asUs(), oldEntry.id);
  XLOGF_IF(FATAL, movedEntry.isSymlink(), "entry is symlink {}", movedEntry);
  XLOGF_IF(FATAL, !movedEntry.valid(), "Invalid entry {} {}", movedEntry, movedEntry.valid().error());
  // Write the new entry first, then drop the old one.
  CO_RETURN_ON_ERROR(co_await movedEntry.store(txn));
  CO_RETURN_ON_ERROR(co_await oldEntry.remove(txn));
  co_return Void{};
}
/* GcManager::GcTask */
// Looks up the attributes of user `uid`. If the lookup fails with kAuthenticationFail, a
// synthetic user (gid equal to uid, no groups, name "user-<uid>") is returned instead; every
// other result is passed through unchanged.
CoTryTask<flat::UserAttr> GcManager::GcTask::getUserAttr(GcManager &manager, flat::Uid uid) {
  auto res = co_await manager.userStore_->getUser(uid);
  const bool unknownUser = res.hasError() && res.error().code() == StatusCode::kAuthenticationFail;
  if (!unknownUser) {
    co_return res;
  }

  // user not found, maybe running in unittest, or user has been removed
  auto fallback = flat::UserAttr();
  fallback.uid = uid;
  fallback.gid = flat::Gid(uid.toUnderType());
  fallback.groups = {};
  fallback.name = fmt::format("user-{}", uid.toUnderType());
  co_return fallback;
}
// Executes this GC task: garbage-collects the file or directory behind `taskEntry`.
//
// On failure the error is classified:
//  - critical errors (any Meta error other than kBusy, or the storage client codes kInvalidArg,
//    kChecksumMismatch, kFoundBug) are logged and counted, and the GC entry is moved to the tail
//    of the queue so it is retried after `retry_delay`;
//  - kBusy also moves the entry to the tail, without counting it as critical;
//  - kReadOnlyServer sleeps for one second before returning.
// The original error is returned in every failure case.
CoTryTask<void> GcManager::GcTask::run(GcManager &manager) {
  XLOGF(DBG, "GcTask {} run", taskEntry);
  Result<Void> result = makeError(MetaCode::kFoundBug);
  switch (taskEntry.type) {
    case InodeType::File:
      result = co_await gcFile(manager);
      break;
    case InodeType::Directory:
      result = co_await gcDirectory(manager);
      break;
    case InodeType::Symlink:
    default:
      XLOGF(FATAL, "Invalid type {}, {}", magic_enum::enum_name(taskEntry.type), taskEntry);
  }
  if (!result.hasError()) {
    co_return Void{};
  }

  auto code = result.error().code();
  auto critical = (StatusCode::typeOf(code) == StatusCodeType::Meta && code != MetaCode::kBusy)  // Meta error code
                  || (code == StorageClientCode::kInvalidArg || code == StorageClientCode::kChecksumMismatch ||
                      code == StorageClientCode::kFoundBug);  // storage error code
  if (critical || code == MetaCode::kBusy) {
    if (critical) {
      XLOGF(CRITICAL, "GcTask {} run failed, error {}", taskEntry.id, result.error());
      gcCritical.addSample(1);
    }
    // Postpone the entry so it does not block the head of the queue. The outcome of this
    // transaction does not change what is returned to the caller, but a failure must not go
    // unnoticed: the entry then keeps its old position and will be rescanned without delay.
    auto moved = co_await manager.runReadWrite([&](IReadWriteTransaction &txn) -> CoTryTask<void> {
      co_return co_await gcDir->moveToTail(txn, *this, manager.config_.gc().retry_delay());
    });
    if (moved.hasError()) {
      XLOGF(ERR, "GcTask {} failed to move GC entry to tail, error {}", taskEntry, moved.error());
    }
  }
  if (code == StorageClientCode::kReadOnlyServer) {
    XLOGF(ERR, "GcTask {} run failed, readonly server, {}", taskEntry, result.error());
    // Back off briefly before this worker picks up the next task.
    co_await folly::coro::sleep(std::chrono::seconds(1));
  }
  co_return result;
}
// Garbage-collects the directory behind `taskEntry`: removes its children in batches (one
// read-write transaction per batch) and finally removes the GC entry and the directory inode.
// Concurrency is bounded by manager.concurrentGcDirSemaphore_.
CoTryTask<void> GcManager::GcTask::gcDirectory(GcManager &manager) {
SemaphoreGuard guard(manager.concurrentGcDirSemaphore_);
co_await guard.coWait();
XLOGF_IF(DFATAL, !taskEntry.isDirectory(), "{} is not directory", taskEntry);
// old version did not record who performed the recursive remove
// just assume that the directory owner performed this operation.
// NOTE(review): when gcInfo is absent, dirAcl is dereferenced without a check — presumably it is
// always set for directory entries; confirm.
auto uid = taskEntry.gcInfo ? taskEntry.gcInfo->user : taskEntry.dirAcl->uid;
auto user = co_await getUserAttr(manager, uid);
CO_RETURN_ON_ERROR(user);
// State shared across handler invocations:
//  finished - set once the GC entry and inode have been removed, which ends the loop below;
//  checked  - set once the sanity check and event logging have been executed.
auto finished = false;
auto checked = false;
auto handler = [&](IReadWriteTransaction &txn) -> CoTryTask<void> {
// Optionally run the transaction with FoundationDB batch priority.
auto fdbTxn = dynamic_cast<kv::FDBTransaction *>(&txn);
if (fdbTxn && manager.config_.gc().txn_low_priority()) {
fdbTxn->setOption(FDBTransactionOption::FDB_TR_OPTION_PRIORITY_BATCH, {});
}
if (!checked) {
auto loadInode = co_await Inode::load(txn, taskEntry.id);
CO_RETURN_ON_ERROR(loadInode);
if (!loadInode->has_value()) {
// inode is already removed, may happens when retry transaction
XLOGF(ERR, "taskEntry {}, inode already removed", taskEntry);
CO_RETURN_ON_ERROR(co_await removeGcEntryAndInode(manager, txn));
finished = true;
co_return Void{};
}
// sanity check
// An inode queued for GC must be an unlinked (nlink == 0), mutable directory.
auto &inode = **loadInode;
if (inode.nlink || !inode.isDirectory() || inode.acl.iflags & FS_IMMUTABLE_FL) {
XLOGF(DFATAL,
"taskEntry {}, inode {}, nlink {}, directory {}, immutable {}",
taskEntry,
inode,
inode.nlink,
inode.isDirectory(),
inode.acl.iflags & FS_IMMUTABLE_FL);
co_return makeError(MetaCode::kFoundBug);
}
// Record the GC of this directory in the event log and the structured trace log.
Event(Event::Type::GC)
.addField("inode", inode.id)
.addField("owner", inode.acl.uid)
.addField("parent", inode.asDirectory().parent)
.addField("name", inode.asDirectory().name)
.log();
manager.getEventTraceLog().newEntry(MetaEventTrace{.eventType = Event::Type::GC,
.inodeId = inode.id,
.parentId = inode.asDirectory().parent,
.entryName = inode.asDirectory().name,
.ownerId = inode.acl.uid});
checked = true;
}
// Remove one batch of children; the directory itself is removed once no more entries remain.
auto list = co_await DirEntryList::load(txn, taskEntry.id, "", manager.config_.gc().gc_directory_entry_batch());
CO_RETURN_ON_ERROR(list);
for (auto &entry : list->entries) {
CO_RETURN_ON_ERROR(co_await removeEntry(manager, txn, entry, *user));
}
if (!list->more) {
CO_RETURN_ON_ERROR(co_await removeGcEntryAndInode(manager, txn));
finished = true;
}
co_return Void{};
};
// One transaction per batch until the directory is empty and removed.
while (!finished) {
CO_RETURN_ON_ERROR(co_await manager.runReadWrite(handler));
}
co_return Void{};
}
// Garbage-collects the file behind `taskEntry` in three steps:
//  1. load the inode in a read-only transaction (optionally refusing while a file session exists),
//  2. remove the file's chunks through the FileHelper,
//  3. remove the GC entry and the inode in a read-write transaction.
// The GC event / trace is emitted only after step 3 succeeded.
// Concurrency is bounded by manager.concurrentGcFileSemaphore_.
CoTryTask<void> GcManager::GcTask::gcFile(GcManager &manager) {
SemaphoreGuard guard(manager.concurrentGcFileSemaphore_);
co_await guard.coWait();
assert(taskEntry.isFile());
XLOGF(DBG, "Gc file {}", taskEntry.id);
auto load = co_await manager.runReadOnly([&](auto &txn) -> CoTryTask<std::optional<Inode>> {
// Optionally run the transaction with FoundationDB batch priority.
auto fdbTxn = dynamic_cast<kv::FDBTransaction *>(&txn);
if (fdbTxn && manager.config_.gc().txn_low_priority()) {
fdbTxn->setOption(FDBTransactionOption::FDB_TR_OPTION_PRIORITY_BATCH, {});
}
// With session checking disabled the inode is returned as-is, without the sanity check below.
if (!manager.config_.gc().check_session()) {
co_return co_await Inode::snapshotLoad(txn, taskEntry.id);
}
// Load the inode and probe for an existing session concurrently.
auto [inode, session] = co_await folly::coro::collectAll(Inode::snapshotLoad(txn, taskEntry.id),
FileSession::snapshotCheckExists(txn, taskEntry.id));
CO_RETURN_ON_ERROR(inode);
CO_RETURN_ON_ERROR(session);
if (*session) {
// Some client still has a session on the file: report kBusy so the task is retried later.
XLOGF(CRITICAL, "Delay gc file {}, still has session {}.", taskEntry.id, session.value()->clientId);
gcBusy.addSample(1);
co_return makeError(MetaCode::kBusy, "still have session");
}
// sanity check
// An inode queued for GC must be unlinked (nlink == 0) and must not be immutable.
if (inode->has_value()) {
if (inode->value().nlink || inode->value().acl.iflags & FS_IMMUTABLE_FL) {
XLOGF(DFATAL,
"taskEntry {}, inode {}, nlink {}, immutable {}",
taskEntry,
**inode,
inode->value().nlink,
inode->value().acl.iflags & FS_IMMUTABLE_FL);
co_return makeError(MetaCode::kFoundBug);
}
}
co_return inode;
});
CO_RETURN_ON_ERROR(load);
auto &inode = *load;
// Prepared here but only emitted once the metadata removal below has succeeded.
std::optional<Event> event;
std::optional<MetaEventTrace> trace;
if (LIKELY(inode.has_value())) {
// Chunks are removed with uid 0 / gid 0 rather than with the file owner's identity.
auto result = co_await manager.fileHelper_->remove(UserInfo(Uid(0), Gid(0)),
*inode,
manager.config_.gc().retry_remove_chunks(),
manager.config_.gc().remove_chunks_batch_size());
if (result.hasError()) {
XLOGF(ERR, "GcManager failed to remove chunks for {}, error {}", inode->id, result.error());
CO_RETURN_ON_ERROR(result);
}
auto chunks = *result;
XLOGF(DBG, "GcManager removed {} chunks for {}", chunks, inode->id);
chunksDist.addSample(chunks);
event = Event(Event::Type::GC)
.addField("inode", inode->id)
.addField("owner", inode->acl.uid)
.addField("length", inode->asFile().length)
.addField("chunks", chunks);
trace = MetaEventTrace{
.eventType = Event::Type::GC,
.inodeId = inode->id,
.ownerId = inode->acl.uid,
.length = inode->asFile().length,
.removedChunks = chunks,
};
} else {
// No inode: nothing to remove from storage, but the GC entry is still cleaned up below.
XLOGF(CRITICAL, "Inode of {} not found, shouldn't happen!!", taskEntry.id);
}
auto remove =
co_await manager.runReadWrite([&](IReadWriteTransaction &txn) { return removeGcEntryAndInode(manager, txn); });
if (remove.hasError()) {
XLOGF(ERR, "GcManager failed to remove GC entry and Inode for {}, error {}", taskEntry.id, remove.error());
CO_RETURN_ON_ERROR(remove);
}
if (event) {
event->log();
}
if (trace) {
manager.getEventTraceLog().append(*trace);
}
co_return Void{};
}
// Removes one child `entry` of the directory being garbage-collected, inside `txn`.
//
// The entry and its inode are cross-checked first (parent / id / directory parent must match,
// otherwise kFoundBug). If the child may not be removed — it is immutable, or recursive permission
// checking is enabled and `user` lacks permission on a non-empty directory — it is moved into an
// orphan directory instead of being deleted. Otherwise removal is delegated to
// GcManager::removeEntry with GC info derived from the task entry.
CoTryTask<void> GcManager::GcTask::removeEntry(GcManager &manager,
                                               IReadWriteTransaction &txn,
                                               const DirEntry &entry,
                                               const flat::UserAttr &user) {
  if (entry.parent != taskEntry.id) {
    XLOGF(DFATAL, "{}.parent != {}.id", entry, taskEntry);
    co_return makeError(MetaCode::kFoundBug);
  }

  auto inode = co_await entry.loadInode(txn);
  CO_RETURN_ON_ERROR(inode);
  if (inode->id != entry.id) {
    // Log the loaded inode itself (not the Result wrapper), consistent with the check below.
    XLOGF(DFATAL, "{}.id != {}.id", *inode, entry);
    co_return makeError(MetaCode::kFoundBug);
  }
  if (inode->isDirectory() && inode->asDirectory().parent != entry.parent) {
    XLOGF(DFATAL, "entry {}, inode {}, different parent", entry, *inode);
    co_return makeError(MetaCode::kFoundBug);
  }

  bool perm = true;
  if (inode->acl.iflags & FS_IMMUTABLE_FL) {
    perm = false;
  } else if (manager.config_.gc().recursive_perm_check() && inode->isDirectory()) {
    auto check = inode->acl.checkRecursiveRmPerm(flat::UserInfo(user.uid, user.gid, user.groups), false);
    if (check.hasError()) {
      // allow remove empty directory
      auto empty = co_await DirEntryList::checkEmpty(txn, inode->id);
      CO_RETURN_ON_ERROR(empty);
      perm = *empty;
    }
  }

  if (!perm) {
    // no permission to remove, move into orphan directory
    auto orphanEntry = co_await createOrphanEntry(manager, txn, entry, *inode, user);
    CO_RETURN_ON_ERROR(orphanEntry);
    XLOGF(CRITICAL, "no permission to perform recursive remove {}, move to {}", entry, *orphanEntry);
    if (inode->isDirectory()) {
      // Keep the directory inode's parent pointer in sync with its new location.
      inode->asDirectory().parent = orphanEntry->parent;
      CO_RETURN_ON_ERROR(co_await inode->store(txn));
    }
    CO_RETURN_ON_ERROR(co_await entry.remove(txn));
    co_return Void{};
  }

  // can remove this entry
  auto gcInfo = GcInfo();
  gcInfo.user = taskEntry.gcInfo ? taskEntry.gcInfo->user : taskEntry.dirAcl->uid;
  gcInfo.origPath = taskEntry.gcInfo ? taskEntry.gcInfo->origPath / entry.name : Path(entry.name);
  CO_RETURN_ON_ERROR(co_await manager.removeEntry(txn, entry, *inode, gcInfo));
  co_return Void{};
}
// Creates a new directory entry for `entry` / `inode` under an orphan directory, for children
// that could not be removed during recursive GC.
//
// The orphan directory is "trash/gc-orphans/<user name>-<YYYYMMDD>" below the root; for files an
// extra level named after the original parent id is appended. Missing path components are created.
// If the task carries gcInfo, a symlink "_hf3fs_original_path" pointing at the original path is
// created as well. Returns the newly stored orphan entry; the caller removes the old entry.
CoTryTask<DirEntry> GcManager::GcTask::createOrphanEntry(GcManager &manager,
IReadWriteTransaction &txn,
const DirEntry &entry,
const Inode &inode,
const flat::UserAttr &user) {
XLOGF_IF(FATAL, entry.id != inode.id, "entry {}, inode {}", entry, inode);
auto orphanDir = Path(fmt::format("trash/gc-orphans/{}-{:%Y%m%d}", user.name, UtcClock::now()));
if (inode.isFile()) {
orphanDir = orphanDir / fmt::format("{}", entry.parent);
}
auto orphanName = entry.name;
// Allocates a fresh inode id and aborts (FATAL) if an inode with that id already exists.
auto allocateInodeId = [&]() -> CoTryTask<InodeId> {
auto id = co_await manager.idAlloc_->allocate();
CO_RETURN_ON_ERROR(id);
auto load = co_await Inode::load(txn, *id);
CO_RETURN_ON_ERROR(load);
if (load->has_value()) {
auto &inode = **load;
XLOGF(FATAL, "Found duplicated InodeId {}, {}", *id, inode);
}
co_return *id;
};
// create orphan directory
// Walk the path component by component starting at the root, reusing existing directories.
auto parent = InodeId::root();
for (const auto &iter : orphanDir) {
assert(!iter.empty());
const auto &fname = iter.string();
// If the name is taken by a non-directory, retry with "<name truncated to 240 chars>.<i>".
for (size_t i = 0; true; i++) {
auto name = i == 0 ? fname : fmt::format("{}.{}", fname.substr(0, 240), i);
// Note: this local `entry` shadows the function parameter of the same name.
auto entry = co_await DirEntry::load(txn, parent, name);
CO_RETURN_ON_ERROR(entry);
auto found = entry->has_value();
if (found) {
if (entry.value()->isDirectory()) {
parent = entry.value()->id;
break;
}
XLOGF(WARN, "entry {} exists, but not directory", **entry);
continue;
}
// Component does not exist yet: create it, owned by uid 0 / gid 0 with mode 0755.
auto id = co_await allocateInodeId();
CO_RETURN_ON_ERROR(id);
auto acl = Acl(flat::Uid(0), flat::Gid(0), flat::Permission(0755));
auto newEntry = DirEntry::newDirectory(parent, name, *id, acl);
CO_RETURN_ON_ERROR(co_await newEntry.store(txn));
auto newInode = Inode::newDirectory(*id, parent, name, acl, Layout(), UtcClock::now().castGranularity(1_s));
CO_RETURN_ON_ERROR(co_await newInode.store(txn));
parent = *id;
break;
}
}
if (taskEntry.gcInfo) {
// create a symlink under directory, give original path
// For an orphaned directory the symlink lives inside that directory; for a file it lives in
// the orphan directory next to the file.
auto symlinkParent = inode.isDirectory() ? inode.id : parent;
auto origPath = inode.isDirectory() ? taskEntry.gcInfo->origPath / entry.name : taskEntry.gcInfo->origPath;
// First try the plain name; on a conflict, retry with a timestamp suffix.
for (size_t i = 0; true; i++) {
auto symlinkName =
i == 0 ? "_hf3fs_original_path" : fmt::format("_hf3fs_original_path.{}", UtcClock::now().toMicroseconds());
auto entry = co_await DirEntry::load(txn, symlinkParent, symlinkName);
CO_RETURN_ON_ERROR(entry);
if (entry->has_value()) {
if (entry->value().isSymlink()) {
auto inode = co_await entry->value().loadInode(txn);
CO_RETURN_ON_ERROR(inode);
// A symlink with the same target already exists: nothing to do.
if (inode->asSymlink().target == origPath) {
break;
}
}
continue;
}
auto id = co_await allocateInodeId();
CO_RETURN_ON_ERROR(id);
// The symlink is owned by the user recorded in gcInfo; the gid is derived from that uid.
auto symlinkInode = Inode::newSymlink(*id,
origPath,
taskEntry.gcInfo->user,
flat::Gid(taskEntry.gcInfo->user.toUnderType()),
UtcClock::now().castGranularity(1_s));
auto symlinkEntry = DirEntry::newSymlink(symlinkParent, symlinkName, *id);
CO_RETURN_ON_ERROR(co_await symlinkInode.store(txn));
CO_RETURN_ON_ERROR(co_await symlinkEntry.store(txn));
break;
}
}
// Store the orphan entry under its original name, or under
// "<name truncated to 230 chars>.<timestamp in microseconds>" if that name is already taken.
for (size_t i = 0; true; i++) {
auto name = i == 0 ? orphanName : fmt::format("{}.{}", orphanName.substr(0, 230), UtcClock::now().toMicroseconds());
auto check = co_await DirEntry::checkExist(txn, parent, name);
CO_RETURN_ON_ERROR(check);
if (auto exists = *check; !exists) {
auto orphanEntry = entry;
orphanEntry.parent = parent;
orphanEntry.name = name;
CO_RETURN_ON_ERROR(co_await orphanEntry.store(txn));
co_return orphanEntry;
}
}
}
// Final step of a GC task: deletes the GC directory entry of this task and then the inode it
// refers to, both within `txn`. Stops at the first failing step.
CoTryTask<void> GcManager::GcTask::removeGcEntryAndInode(GcManager &manager, IReadWriteTransaction &txn) {
  auto entryRemoved = co_await taskEntry.remove(txn, true);
  CO_RETURN_ON_ERROR(entryRemoved);

  auto inodeRemoved = co_await Inode(taskEntry.id).remove(txn);
  CO_RETURN_ON_ERROR(inodeRemoved);

  co_return Void{};
}
// Initializes the GC manager: verifies that the filesystem roots exist and opens this server's
// GC directories. Slot 0 is only used if it already exists; the other slots are created on demand.
// Aborts (FATAL) if no GC directory could be opened.
CoTryTask<void> GcManager::init() {
  XLOGF(INFO, "GcManager::init");

  auto fsCheck = co_await checkFs();
  if (fsCheck.hasError()) {
    XLOGF(ERR, "GcManager::checkFs failed, error {}", fsCheck.error());
    CO_RETURN_ERROR(fsCheck);
  }

  // each meta server may have 5 GC directory
  for (size_t idx = 0; idx < kNumGcDirectoryPerServer; ++idx) {
    const bool create = idx != 0;
    auto opened = co_await openGcDirectory(idx, create);
    if (opened.hasError()) {
      XLOGF(ERR, "GcManager::openGcDirectory({}, {}) failed, error {}", idx, create, opened.error());
      CO_RETURN_ERROR(opened);
    }
    // A null pointer means the slot does not exist and was not created.
    if (*opened) {
      currGcDirectories_.push_back(*opened);
    }
  }

  XLOGF_IF(FATAL, currGcDirectories_.empty(), "currGcDirectories_.empty()");
  XLOGF(INFO, "GcManager::init success.");
  co_return Void{};
}
// Starts the GC machinery on `exec`: the worker pool executing GC tasks, a background job that
// scans all GC directories every 30 seconds, and the per-directory scanners.
// init() must have populated currGcDirectories_ beforehand.
void GcManager::start(CPUExecutorGroup &exec) {
  XLOGF(DBG, "GcManager start.");
  XLOGF_IF(FATAL, currGcDirectories_.empty(), "GcDirectory not set!!!");

  // start GC workers
  gcWorkers_ = std::make_unique<PriorityCoroutinePool<GcTask>>(config_.gc().workers());
  gcWorkers_->start(folly::partial(&GcManager::runGcTask, this), exec);

  // start GC scanner
  auto scanAll = [this]() -> CoTask<void> {
    if (!config_.gc().enable()) {
      co_return;
    }
    auto result = co_await this->scanAllGcDirectories();
    XLOGF_IF(ERR,
             result.hasError(),
             "GcManager failed to scan all available GC directories, error {}",
             result.error());
  };
  auto scanPeriod = []() { return 30_s; };
  gcRunner_ = std::make_unique<BackgroundRunner>(&exec.pickNext());
  gcRunner_->start("ScanAllGcDirs", std::move(scanAll), std::move(scanPeriod));

  for (const auto &dir : currGcDirectories_) {
    dir->start(*this, exec);
  }
  XLOGF(INFO, "GcManager started!");
}
// Shuts the GC machinery down in order: per-directory scanners first (no new tasks are produced),
// then the periodic background runner, and finally the worker pool. Safe to call when the runner
// or the workers were never started.
void GcManager::stopAndJoin() {
  XLOGF(INFO, "GcManager stop.");

  for (const auto &dir : currGcDirectories_) {
    dir->stopAndJoin();
  }

  if (gcRunner_) {
    auto runnerStopped = gcRunner_->stopAll();
    folly::coro::blockingWait(std::move(runnerStopped));
    gcRunner_.reset();
  }

  if (gcWorkers_) {
    gcWorkers_->stopAndJoin();
    gcWorkers_.reset();
  }

  XLOGF(INFO, "GcManager stopped!");
}
// Verifies in a read-only transaction that both the filesystem root inode and the GC root inode
// exist; returns kBadFileSystem if either one is missing.
CoTryTask<void> GcManager::checkFs() {
  co_return co_await runReadOnly([&](auto &txn) -> CoTryTask<void> {
    // check tree roots exist
    auto isPresent = [](auto &loaded) { return loaded.has_value(); };
    auto root = (co_await Inode::snapshotLoad(txn, InodeId::root())).then(isPresent);
    auto gcRoot = (co_await Inode::snapshotLoad(txn, InodeId::gcRoot())).then(isPresent);
    CO_RETURN_ON_ERROR(root);
    CO_RETURN_ON_ERROR(gcRoot);

    if (*root && *gcRoot) {
      co_return Void{};
    }
    XLOGF(CRITICAL, "Root or GcRoot not found, root {}, gcRoot {}", root, gcRoot);
    co_return makeError(MetaCode::kBadFileSystem);
  });
}
// Opens the GC directory with index `idx` of this node inside the GC root, in one read-write
// transaction.
//
// Returns the existing directory if its entry is present. Otherwise, if `create` is false a null
// pointer is returned; if `create` is true a new directory inode and entry are stored and returned.
CoTryTask<std::shared_ptr<GcManager::GcDirectory>> GcManager::openGcDirectory(size_t idx, bool create) {
// generate GC directory name based on nodeId
auto gcDirectoryName = GcDirectory::nameOf(nodeId_, idx);
XLOGF(INFO, "Open GC directory {}/{}", InodeId::gcRoot(), gcDirectoryName);
co_return co_await runReadWrite([&](IReadWriteTransaction &txn) -> CoTryTask<std::shared_ptr<GcDirectory>> {
auto entry = co_await DirEntry::load(txn, InodeId::gcRoot(), gcDirectoryName);
CO_RETURN_ON_ERROR(entry);
if (entry->has_value()) {
XLOGF(INFO, "GC directory {}/{} -> {} exists", InodeId::gcRoot(), gcDirectoryName, **entry);
co_return std::make_shared<GcDirectory>(**entry);
}
if (!create) {
co_return std::shared_ptr<GcDirectory>();
}
// chose InodeId randomly
// 512 ids are allocated and one of them is picked after shuffling; the others are not used.
// NOTE(review): this handler runs inside runReadWrite — if the transaction is retried, another
// 512 ids would presumably be allocated; confirm that this is acceptable.
std::vector<InodeId> inodeIds;
while (inodeIds.size() < 512) {
auto newId = co_await idAlloc_->allocate();
CO_RETURN_ON_ERROR(newId);
inodeIds.push_back(*newId);
}
// NOTE(review): std::mt19937 / std::random_device live in <random>, which this file does not
// include directly — presumably it arrives transitively (e.g. via folly/Random.h); confirm.
std::shuffle(inodeIds.begin(), inodeIds.end(), std::mt19937(std::random_device()()));
auto newId = inodeIds.front();
auto gcDir = Inode::newDirectory(newId,
InodeId::gcRoot(),
gcDirectoryName,
Acl::gcRoot(),
Layout() /* Invalid layout */,
UtcClock::now());
auto gcEntry = DirEntry::newDirectory(InodeId::gcRoot(), gcDirectoryName, newId, Acl::gcRoot());
XLOGF(INFO,
"GC directory {}/{} not found, create it: id {}, entry {}.",
InodeId::gcRoot(),
gcDirectoryName,
newId,
gcEntry);
// Persist both the directory inode and its entry under the GC root.
CO_RETURN_ON_ERROR(co_await gcDir.store(txn));
CO_RETURN_ON_ERROR(co_await gcEntry.store(txn));
co_return std::make_shared<GcDirectory>(gcEntry);
});
}
// Removes `entry` inside `txn` and drops one link from `inode`.
//
// - Fails with kInconsistent if the inode already has nlink == 0, and with kNoPermission if the
//   inode carries FS_IMMUTABLE_FL.
// - Symlinks: the entry is removed, then the inode is stored (links remain) or removed (no links).
// - Files/directories: the inode is stored and the entry removed; when no links remain the inode
//   is added to a GC directory chosen by pickGcDirectory().
// `inode` is modified in place (nlink, ctime).
CoTryTask<void> GcManager::removeEntry(IReadWriteTransaction &txn, const DirEntry &entry, Inode &inode, GcInfo gcInfo) {
  XLOGF(DBG, "GcManager remove entry {}", entry);

  if (inode.nlink == 0) {
    auto reason = fmt::format("DirEntry {} exists, but inode {}'s nlink is 0, shouldn't happen!!!", entry, inode);
    XLOG(DFATAL, reason);
    co_return makeError(MetaCode::kInconsistent, reason);
  }
  if (inode.acl.iflags & FS_IMMUTABLE_FL) {
    auto reason = fmt::format("can't remove inode {} with FS_IMMUTABLE_FL", inode.id);
    XLOG(CRITICAL, reason);
    co_return makeError(MetaCode::kNoPermission, reason);
  }

  inode.nlink--;
  SetAttr::update(inode.ctime, UtcClock::now(), config_.time_granularity(), true /* cmp */);

  // Register both the inode and the entry in the transaction's read conflict set.
  CO_RETURN_ON_ERROR(co_await inode.addIntoReadConflict(txn));
  CO_RETURN_ON_ERROR(co_await entry.addIntoReadConflict(txn));

  if (entry.isSymlink()) {
    // Symlink: drop the entry, then either persist the decremented inode or delete it outright.
    CO_RETURN_ON_ERROR(co_await entry.remove(txn));
    if (inode.nlink == 0) {
      CO_RETURN_ON_ERROR(co_await inode.remove(txn));
    } else {
      CO_RETURN_ON_ERROR(co_await inode.store(txn));
    }
    co_return Void{};
  }

  // Directory or file: persist the inode, then drop the entry.
  CO_RETURN_ON_ERROR(co_await inode.store(txn));
  CO_RETURN_ON_ERROR(co_await entry.remove(txn));

  if (inode.isDirectory()) {
    if (inode.nlink != 0) {
      XLOGF(DFATAL, "Directory {} nlink != 0", inode);
      co_return makeError(MetaCode::kFoundBug);
    }
    if (inode.asDirectory().parent != entry.parent) {
      XLOGF(DFATAL, "Directory inode {}, entry {}, parent not match", inode.asDirectory(), entry);
      co_return makeError(MetaCode::kFoundBug);
    }
  }

  if (inode.nlink == 0) {
    // Last reference is gone: hand the inode over to a GC directory.
    auto gcDirectory = pickGcDirectory();
    CO_RETURN_ON_ERROR(co_await gcDirectory->add(txn, inode, config_.gc(), gcInfo));
  } else {
    // this is not last reference, can't remove
    XLOGF(DBG, "Inode {} has {} links after remove {}", inode.id, inode.nlink, entry);
  }
  co_return Void{};
}
// Lists every entry under InodeId::gcRoot(), classifies each as active or inactive, and publishes
// the active ones into allGcDirectories_.
//
// An entry is active when its name matches a non-zero-index GC directory name of an active META
// node reported by mgmtd routing info, or when it is one of this server's currGcDirectories_.
CoTryTask<void> GcManager::scanAllGcDirectories() {
  // Page through the GC root, 128 entries at a time, resuming after the last name seen.
  std::vector<DirEntry> entries;
  for (;;) {
    auto page = co_await runReadOnly([&](auto &txn) -> CoTryTask<DirEntryList> {
      auto prev = entries.empty() ? "" : entries.back().name;
      co_return co_await DirEntryList::snapshotLoad(txn, InodeId::gcRoot(), prev, 128);
    });
    CO_RETURN_ON_ERROR(page);
    entries.insert(entries.end(), page->entries.begin(), page->entries.end());
    if (!page->more) {
      break;
    }
  }

  // Names of GC directories owned by active meta nodes (directory 0 of each node is skipped).
  std::map<std::string, flat::NodeId> active;
  if (auto routing = mgmtd_->getRoutingInfo(); routing) {
    auto nodes = routing->getNodeBy(flat::selectNodeByType(flat::NodeType::META) && flat::selectActiveNode());
    for (auto &node : nodes) {
      for (size_t idx = 1; idx < kNumGcDirectoryPerServer; idx++) {
        active.insert_or_assign(GcDirectory::nameOf(node.app.nodeId, idx), node.app.nodeId);
      }
    }
  }

  auto isCurrent = [&](const auto &name) {
    return std::any_of(currGcDirectories_.begin(), currGcDirectories_.end(), [&](const auto &gcDir) {
      return name == gcDir->name();
    });
  };

  std::vector<std::shared_ptr<GcDirectory>> activeDirs;
  std::set<std::string> activeNames, inactiveNames;
  for (auto &entry : entries) {
    if (active.contains(entry.name) || isCurrent(entry.name)) {
      activeDirs.emplace_back(std::make_shared<GcDirectory>(entry));
      activeNames.insert(entry.name);
    } else {
      inactiveNames.insert(entry.name);
    }
  }
  XLOGF(INFO,
        "GcManager found {} GC directories, active {}, inactive {}",
        entries.size(),
        fmt::join(activeNames.begin(), activeNames.end(), ","),
        fmt::join(inactiveNames.begin(), inactiveNames.end(), ","));

  // Every directory this server currently uses must have been found under the GC root.
  for (const auto &gcDir : currGcDirectories_) {
    XLOGF_IF(FATAL,
             !activeNames.contains(gcDir->name()),
             "Current GC Directory {} not found under GcRoot",
             gcDir->name());
  }

  allGcDirectories_.withWLock([&](auto &val) { val = activeDirs; });
  co_return Void{};
}
// Decides whether the GC delay should be applied.
//
// Returns false only when a cached FsStatus is available and the free-space percentage is below
// gc_delay_free_space_threshold; returns true otherwise, including when no FsStatus is cached.
bool GcManager::enableGcDelay() const {
  auto fsStatus = fileHelper_->cachedFsStatus();
  if (!fsStatus.has_value()) {
    XLOGF_EVERY_MS(WARN, 5000, "GcManager failed to get FsStatus");
    return true;
  }

  auto free = (double)fsStatus->free / fsStatus->capacity * 100;
  const bool lowSpace = free < config_.gc().gc_delay_free_space_threshold();
  if (lowSpace) {
    XLOGF_EVERY_MS(WARN,
                   5000,
                   "free space {} < {}, disable GC delay",
                   free,
                   config_.gc().gc_delay_free_space_threshold());
  }
  return !lowSpace;
}
// Executes one GC task and records its outcome in the GC metrics.
// The owning GcDirectory is always notified through finish(), even when GC is disabled.
CoTask<void> GcManager::runGcTask(GcTask task) {
  SCOPE_EXIT { task.gcDir->finish(task); };

  if (config_.gc().enable()) {
    auto begin = SteadyClock::now();
    auto result = co_await task.run(*this);
    if (!result.hasError()) {
      XLOGF(DBG, "GC {} success", task.taskEntry);
      gcSuccCount.addSample(1, {{"instance", task.taskEntry.isDirectory() ? "directory" : "file"}});
      gcLatency.addSample(SteadyClock::now() - begin);
    } else {
      XLOGF(ERR, "GC {} failed, error {}", task.taskEntry.id, result.error());
      gcFailCount.addSample(1);
    }
  }
  co_return;
}
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,296 @@
#pragma once
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <folly/Executor.h>
#include <folly/Random.h>
#include <folly/Synchronized.h>
#include <folly/Utility.h>
#include <folly/executors/CPUThreadPoolExecutor.h>
#include <folly/logging/xlog.h>
#include <gtest/gtest_prod.h>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <utility>
#include <variant>
#include <vector>
#include "client/mgmtd/ICommonMgmtdClient.h"
#include "client/mgmtd/MgmtdClient.h"
#include "client/storage/StorageClient.h"
#include "common/app/ApplicationBase.h"
#include "common/app/NodeId.h"
#include "common/kv/IKVEngine.h"
#include "common/kv/ITransaction.h"
#include "common/utils/BackgroundRunner.h"
#include "common/utils/CPUExecutorGroup.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/Coroutine.h"
#include "common/utils/CoroutinesPool.h"
#include "common/utils/CountDownLatch.h"
#include "common/utils/Duration.h"
#include "common/utils/PriorityCoroutinePool.h"
#include "common/utils/Result.h"
#include "common/utils/Semaphore.h"
#include "common/utils/UtcTime.h"
#include "core/user/UserStoreEx.h"
#include "fbs/core/user/User.h"
#include "fbs/meta/Common.h"
#include "fdb/FDBRetryStrategy.h"
#include "fmt/core.h"
#include "meta/base/Config.h"
#include "meta/components/InodeIdAllocator.h"
#include "meta/components/SessionManager.h"
#include "meta/event/Event.h"
#include "meta/store/DirEntry.h"
#include "meta/store/FileSession.h"
#include "scn/scan/scan.h"
namespace hf3fs::meta::server {
class FileHelper;
using hf3fs::client::ICommonMgmtdClient;
/**
 * Garbage collector of the meta server.
 *
 * Inodes whose last link was removed are added to a GC directory (see removeEntry) and later
 * processed as GcTasks (see runGcTask). GC directories live under InodeId::gcRoot() and are named
 * after the owning node id (see GcDirectory::nameOf).
 */
class GcManager {
 public:
  /**
   * Parses a GC entry name of the form "<prefix>-<timestamp>-<inode>".
   *
   * The timestamp is interpreted as microseconds. The prefix character is parsed but not returned.
   * Returns kInvalidArg if the name does not match.
   * NOTE(review): presumably "{:i}" lets scn detect the base of the hex string written by
   * formatGcEntry -- confirm against the scn version in use.
   */
  static Result<std::pair<UtcTime, InodeId>> parseGcEntry(std::string_view entry) {
    char prefix = 0;
    uint64_t timestamp = 0;
    uint64_t inode = 0;
    auto ret = scn::scan(entry, "{}-{}-{:i}", prefix, timestamp, inode);
    if (!ret) {
      return makeError(StatusCode::kInvalidArg);
    }
    return std::pair<UtcTime, InodeId>{UtcTime::fromMicroseconds(timestamp), InodeId(inode)};
  }

  /** Formats a GC entry name: prefix, zero-padded 20-digit microsecond timestamp, inode id as hex string. */
  static std::string formatGcEntry(char prefix, UtcTime timestamp, InodeId inode) {
    return fmt::format("{}-{:020d}-{}", prefix, (uint64_t)timestamp.toMicroseconds(), inode.toHexString());
  }

  /**
   * Stores the collaborators and sizes both concurrency semaphores from the GC config.
   * A config callback is registered so that hot updates of gc_directory_concurrent and
   * gc_file_concurrent resize the matching semaphore; a value of 0 is ignored.
   * Aborts (FATAL) when nodeId is invalid.
   */
  GcManager(const Config &config,
            flat::NodeId nodeId,
            analytics::StructuredTraceLog<MetaEventTrace> &metaEventTraceLog,
            std::shared_ptr<kv::IKVEngine> kvEngine,
            std::shared_ptr<ICommonMgmtdClient> mgmtd,
            std::shared_ptr<InodeIdAllocator> idAlloc,
            std::shared_ptr<FileHelper> fileHelper,
            std::shared_ptr<SessionManager> sessionManager,
            std::shared_ptr<core::UserStoreEx> userStore)
      : config_(config),
        nodeId_(nodeId),
        metaEventTraceLog_(metaEventTraceLog),
        kvEngine_(kvEngine),
        mgmtd_(mgmtd),
        idAlloc_(idAlloc),
        fileHelper_(fileHelper),
        sessionManager_(sessionManager),
        userStore_(userStore),
        concurrentGcDirSemaphore_(config_.gc().gc_directory_concurrent()),
        concurrentGcFileSemaphore_(config_.gc().gc_file_concurrent()) {
    XLOGF_IF(FATAL, !nodeId_, "invalid node id {}", nodeId_);
    guard_ = config_.gc().addCallbackGuard([&]() {
      auto dirConcurrent = config_.gc().gc_directory_concurrent();
      if (dirConcurrent != 0 && dirConcurrent != concurrentGcDirSemaphore_.getUsableTokens()) {
        XLOGF(INFO, "GcManager set gc directory concurrent to {}", dirConcurrent);
        concurrentGcDirSemaphore_.changeUsableTokens(dirConcurrent);
        XLOGF(INFO, "GcManager finished update gc directory concurrent");
      }
      auto fileConcurrent = config_.gc().gc_file_concurrent();
      if (fileConcurrent != 0 && fileConcurrent != concurrentGcFileSemaphore_.getUsableTokens()) {
        // This branch resizes the *file* semaphore, so the message must say "file".
        XLOGF(INFO, "GcManager set gc file concurrent to {}", fileConcurrent);
        concurrentGcFileSemaphore_.changeUsableTokens(fileConcurrent);
        XLOGF(INFO, "GcManager finished update gc file concurrent");
      }
    });
  }

  CoTryTask<void> init();
  void start(CPUExecutorGroup &exec);
  void stopAndJoin();

  auto &getEventTraceLog() { return metaEventTraceLog_; }

  /** Removes `entry` within `txn` and, when the inode loses its last link, queues it for GC. */
  CoTryTask<void> removeEntry(IReadWriteTransaction &txn, const DirEntry &entry, Inode &inode, GcInfo gcInfo);

 private:
  template <typename T>
  FRIEND_TEST(TestRemove, GC);

  /** Kind of a GC entry. MAX is the number of kinds and sizes GcDirectory::states_. */
  enum GcEntryType {
    DIRECTORY = 0,
    FILE_MEDIUM,
    FILE_LARGE,
    FILE_SMALL,
    MAX,
  };

  class GcDirectory;

  /** One unit of GC work: a single entry of a GC directory together with its kind. */
  struct GcTask {
    std::shared_ptr<GcDirectory> gcDir;  // directory the entry was found in
    GcEntryType type;
    DirEntry taskEntry;

    GcTask(std::shared_ptr<GcDirectory> gcDir, GcEntryType type, DirEntry entry)
        : gcDir(std::move(gcDir)),
          type(type),
          taskEntry(std::move(entry)) {}

    static CoTryTask<flat::UserAttr> getUserAttr(GcManager &manager, flat::Uid uid);

    CoTryTask<void> run(GcManager &manager);
    CoTryTask<void> gcDirectory(GcManager &manager);
    CoTryTask<void> gcFile(GcManager &manager);
    CoTryTask<void> removeEntry(GcManager &manager,
                                IReadWriteTransaction &txn,
                                const DirEntry &entry,
                                const flat::UserAttr &user);
    CoTryTask<void> removeGcEntryAndInode(GcManager &manager, IReadWriteTransaction &txn);
    CoTryTask<DirEntry> createOrphanEntry(GcManager &manager,
                                          IReadWriteTransaction &txn,
                                          const DirEntry &entry,
                                          const Inode &inode,
                                          const flat::UserAttr &user);
  };

  /** A single GC directory, identified by the DirEntry that points to it. */
  class GcDirectory : folly::MoveOnly, public std::enable_shared_from_this<GcDirectory> {
   public:
    using Ptr = std::shared_ptr<GcDirectory>;

    /** Name prefix character used for GC entries of the given kind. Aborts (FATAL) on an invalid kind. */
    static char prefixOf(GcEntryType type) {
      switch (type) {
        case DIRECTORY:
          return 'd';
        case FILE_MEDIUM:
          return 'f';
        case FILE_LARGE:
          return 'L';
        case FILE_SMALL:
          return 'S';
        default:
          XLOGF(FATAL, "invalid type {}", (int)type);
      }
    }

    /** Executor priority used for GC tasks of the given kind. Aborts (FATAL) on an invalid kind. */
    int8_t priorityOf(GcEntryType type) {
      switch (type) {
        case DIRECTORY:
          return folly::Executor::MID_PRI;
        case FILE_MEDIUM:
          return folly::Executor::MID_PRI;
        case FILE_LARGE:
          return folly::Executor::HI_PRI;
        case FILE_SMALL:
          return folly::Executor::LO_PRI;
        default:
          XLOGF(FATAL, "invalid type {}", (int)type);
      }
    }

    /** Directory name for a node: "GC-Node-<id>" for index 0, "GC-Node-<id>.<idx>" otherwise. */
    static std::string nameOf(flat::NodeId nodeId, size_t idx) {
      return idx == 0 ? fmt::format("GC-Node-{}", (uint32_t)nodeId)
                      : fmt::format("GC-Node-{}.{}", (uint32_t)nodeId, idx);
    }

    GcDirectory(DirEntry entry)
        : entry_(std::move(entry)) {}
    ~GcDirectory() { stopAndJoin(); }

    void start(GcManager &manager, CPUExecutorGroup &exec);
    void stopAndJoin();

    auto dirId() const { return entry_.id; }
    std::string name() const { return entry_.name; }

    CoTryTask<void> add(auto &txn, const Inode &inode, const GcConfig &config, GcInfo gcInfo);
    void finish(const GcTask &task);
    CoTryTask<void> moveToTail(auto &txn, const GcTask &task, Duration delay);

   private:
    /** Per-kind bookkeeping; one instance per GcEntryType (see states_). */
    struct QueueState {
      folly::Synchronized<std::set<InodeId>, std::mutex> queued;
      folly::Synchronized<std::set<InodeId>, std::mutex> finished;
      std::atomic<uint64_t> counter{0};
    };

    CoTask<void> scan(GcManager &manager, GcEntryType type);
    CoTryTask<bool> scan(GcManager &manager,
                         GcEntryType type,
                         Duration delay,
                         size_t limit,
                         std::optional<std::string> &prev);

    CoTryTask<void> addFile(auto &txn, const Inode &inode, const GcConfig &config);
    CoTryTask<void> addDirectory(auto &txn, const Inode &inode, GcInfo gcInfo);

    DirEntry entry_;  // entry points to this GcDirectory
    std::array<QueueState, GcEntryType::MAX> states_;
    CancellationSource cancel_;
    CountDownLatch<> latch_;
  };

  const std::vector<GcDirectory::Ptr> &currGcDirectories() const { return currGcDirectories_; }

  /**
   * Chooses the GC directory a newly unlinked inode is added to.
   *
   * With distributed_gc enabled and a non-empty allGcDirectories_, a random element of that list
   * is returned. Otherwise one of this server's own directories is used: the only one when there
   * is exactly one, else a random one drawn via rand32(1, size), i.e. index 0 is not selected.
   * Aborts (FATAL) when currGcDirectories_ is empty.
   */
  GcDirectory::Ptr pickGcDirectory() {
    XLOGF_IF(FATAL, currGcDirectories_.empty(), "currGcDirectories_.empty()");
    if (config_.gc().distributed_gc()) {
      auto guard = allGcDirectories_.rlock();
      if (!guard->empty()) {
        return guard->at(folly::Random::rand64(guard->size()));
      }
    }
    if (currGcDirectories_.size() == 1) {
      return currGcDirectories_[0];
    } else {
      return currGcDirectories_[folly::Random::rand32(1, currGcDirectories_.size())];
    }
  }

  bool enableGcDelay() const;

  CoTryTask<void> checkFs();
  CoTryTask<std::shared_ptr<GcDirectory>> openGcDirectory(size_t idx, bool create);
  CoTryTask<void> scanAllGcDirectories();
  CoTask<void> runGcTask(GcTask task);

  /** Runs `handler` in a fresh read-only transaction under an FDBRetryStrategy built from {1_s, 10, true}. */
  template <typename H>
  std::invoke_result_t<H, IReadOnlyTransaction &> runReadOnly(H &&handler) {
    auto retry = kv::FDBRetryStrategy({1_s, 10, true});
    co_return co_await kv::WithTransaction(retry).run(kvEngine_->createReadonlyTransaction(), std::forward<H>(handler));
  }

  /** Runs `handler` in a fresh read-write transaction under an FDBRetryStrategy built from {1_s, 10, true}. */
  template <typename H>
  std::invoke_result_t<H, IReadWriteTransaction &> runReadWrite(H &&handler) {
    auto retry = kv::FDBRetryStrategy({1_s, 10, true});
    co_return co_await kv::WithTransaction(retry).run(kvEngine_->createReadWriteTransaction(),
                                                      std::forward<H>(handler));
  }

  const Config &config_;
  std::unique_ptr<config::ConfigCallbackGuard> guard_;  // keeps the GC config callback registered
  flat::NodeId nodeId_;
  analytics::StructuredTraceLog<MetaEventTrace> &metaEventTraceLog_;
  std::shared_ptr<kv::IKVEngine> kvEngine_;
  std::shared_ptr<ICommonMgmtdClient> mgmtd_;
  std::shared_ptr<InodeIdAllocator> idAlloc_;
  std::shared_ptr<FileHelper> fileHelper_;
  std::shared_ptr<SessionManager> sessionManager_;
  std::shared_ptr<core::UserStoreEx> userStore_;

  std::vector<GcDirectory::Ptr> currGcDirectories_;                      // directories of this server
  folly::Synchronized<std::vector<GcDirectory::Ptr>> allGcDirectories_;  // filled by scanAllGcDirectories()
  std::unique_ptr<BackgroundRunner> gcRunner_;
  std::unique_ptr<PriorityCoroutinePool<GcTask>> gcWorkers_;
  Semaphore concurrentGcDirSemaphore_;   // sized from gc_directory_concurrent
  Semaphore concurrentGcFileSemaphore_;  // sized from gc_file_concurrent
};
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,73 @@
#include "meta/components/InodeIdAllocator.h"
#include <algorithm>
#include <cassert>
#include <chrono>
#include <fmt/core.h>
#include <folly/Likely.h>
#include <folly/ScopeGuard.h>
#include <folly/Unit.h>
#include <folly/experimental/coro/CurrentExecutor.h>
#include <folly/experimental/coro/Promise.h>
#include <folly/experimental/coro/Sleep.h>
#include <folly/experimental/coro/Timeout.h>
#include <folly/io/async/Request.h>
#include <folly/logging/xlog.h>
#include <mutex>
#include <optional>
#include "common/kv/KeyPrefix.h"
#include "common/utils/Coroutine.h"
#include "common/utils/FaultInjection.h"
#include "common/utils/Result.h"
#define FAULT_INJECTION_INODE_ID_ALLOCATOR true
namespace hf3fs::meta::server {
// Key prefix handed to the underlying IdAllocator by the InodeIdAllocator constructor; built from
// the string form of kv::KeyPrefix::Single followed by "-inode-alloc".
std::string InodeIdAllocator::kAllocatorKeyPrefix = fmt::format("{}-inode-alloc", kv::toStr(kv::KeyPrefix::Single));
// Slow allocation path: makes sure a refill task is running, then waits up to `timeout` for an id
// to appear in the queue. Returns kInodeIdAllocFailed if the wait ends with an exception.
CoTryTask<meta::InodeId> InodeIdAllocator::allocateSlow(std::chrono::microseconds timeout) {
  auto *executor = co_await folly::coro::co_current_executor;
  tryStartAllocateTask(executor);

  auto dequeued = co_await folly::coro::co_awaitTry(folly::coro::timeout(queue_.dequeue(), timeout));
  if (LIKELY(!dequeued.hasException())) {
    // Keep the queue topped up once it falls below half a batch.
    if (UNLIKELY(queue_.size() < kAllocateBatch / 2)) {
      tryStartAllocateTask(executor);
    }
    co_return dequeued.value();
  }
  co_return makeError(MetaCode::kInodeIdAllocFailed);
}
// Refill task: obtains one value from the underlying IdAllocator, expands it into kAllocateBatch
// consecutive InodeIds (value << kAllocatorShift, plus 0..kAllocateBatch-1) and enqueues them.
// On allocation failure a new task is started and `allocating_` is left set.
CoTask<void> InodeIdAllocator::allocateFromDB() {
  auto *executor = co_await folly::coro::co_current_executor;

  auto allocated = co_await allocator_.allocate();
  if (UNLIKELY(allocated.hasError())) {
    XLOGF(CRITICAL, "Failed to allocate InodeId {}", allocated.error().describe());
    // allocation failed, retry in new task
    // NOTE(review): the original comment says "wait sometime", but no delay is passed to the new
    // task here -- confirm whether an immediate retry is intended.
    startAllocateTask(executor);
    co_return;
  }

  // Bits above the allocator mask must be clear, otherwise the shift below would lose them.
  if (UNLIKELY((allocated.value() & ~kAllocatorMask) != 0)) {
    XLOGF(FATAL, "64bit InodeId used up, should never happen, {}!!!", allocated.value());
  }

  auto first = allocated.value() << kAllocatorShift;
  XLOGF(DBG,
        "Get {} from IdAllocator, corresponding to InodeId {} - {}",
        allocated.value(),
        meta::InodeId(first),
        meta::InodeId(first + kAllocateBatch - 1));

  for (uint64_t offset = 0; offset < kAllocateBatch; ++offset) {
    co_await queue_.enqueue(meta::InodeId(first + offset));
  }

  allocating_.store(false);
  if (UNLIKELY(queue_.size() < kAllocateBatch / 2)) {
    tryStartAllocateTask(executor);
  }
}
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,134 @@
#pragma once
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cstddef>
#include <deque>
#include <fmt/core.h>
#include <folly/Executor.h>
#include <folly/Likely.h>
#include <folly/Range.h>
#include <folly/Synchronized.h>
#include <folly/ThreadLocal.h>
#include <folly/Utility.h>
#include <folly/experimental/coro/Baton.h>
#include <folly/experimental/coro/BoundedQueue.h>
#include <folly/experimental/coro/CurrentExecutor.h>
#include <folly/experimental/coro/Mutex.h>
#include <folly/experimental/coro/Promise.h>
#include <folly/experimental/coro/Sleep.h>
#include <folly/experimental/coro/Timeout.h>
#include <folly/fibers/BatchSemaphore.h>
#include <folly/io/async/Request.h>
#include <folly/logging/xlog.h>
#include <list>
#include <memory>
#include <mutex>
#include <optional>
#include <queue>
#include <string>
#include <string_view>
#include <utility>
#include "common/kv/IKVEngine.h"
#include "common/monitor/Recorder.h"
#include "common/utils/Coroutine.h"
#include "common/utils/FaultInjection.h"
#include "common/utils/IdAllocator.h"
#include "common/utils/Result.h"
#include "fbs/meta/Common.h"
#include "fdb/FDBRetryStrategy.h"
namespace hf3fs::meta::server {
/**
 * Generates InodeIds in the range 0x00000000_00001000 to 0x01ffffff_ffffffff.
 *
 * Generated InodeId format: [ high 52 bits: generated by IdAllocator ][ low 12 bits: generated locally ].
 * InodeIdAllocator first uses IdAllocator to generate a 52-bit value, then shifts it left by 12 bits and
 * generates the lower 12 bits locally, so FoundationDB only needs to be accessed once per 4096 InodeIds.
 *
 * Ids are buffered in a bounded queue that is refilled by a background task (allocateFromDB).
 */
class InodeIdAllocator : public std::enable_shared_from_this<InodeIdAllocator> {
  // These values are used in FoundationDB
  static std::string kAllocatorKeyPrefix;
  static constexpr size_t kAllocatorShard = 32;    // avoid txn conflicts
  static constexpr uint64_t kAllocatorShift = 12;  // shift 12 bit
  static constexpr uint64_t kAllocatorBit =
      64 - kAllocatorShift;  // IdAllocator generated values only have 52bits valid.
  static constexpr uint64_t kAllocatorMask = (1ULL << kAllocatorBit) - 1;
  static constexpr uint64_t kAllocateBatch = 1ULL << kAllocatorShift;

  // Constructor tag: the type is private, so outside code has to go through create().
  struct Tag {};

 public:
  InodeIdAllocator(Tag, std::shared_ptr<kv::IKVEngine> kvEngine)
      : engine_(std::move(kvEngine)),
        allocator_(*engine_, createRetryStrategy(), kAllocatorKeyPrefix, kAllocatorShard),
        allocating_(false),
        queue_(2 * kAllocateBatch) {}

  /** Factory; the allocator is always owned by a shared_ptr (it uses weak_from_this internally). */
  static std::shared_ptr<InodeIdAllocator> create(std::shared_ptr<kv::IKVEngine> kvEngine) {
    return std::make_shared<InodeIdAllocator>(Tag{}, std::move(kvEngine));
  }

  /**
   * Returns the next InodeId.
   *
   * Fast path: take an id from the local queue, triggering a refill when fewer than half a batch
   * remain. Slow path: wait up to `timeout` for the refill task to produce an id.
   * Returns kInodeIdAllocFailed when the slow path fails or yields an id that is
   * >= InodeId::kNewChunkEngineMask.
   */
  CoTryTask<InodeId> allocate(std::chrono::microseconds timeout = std::chrono::seconds(2)) {
    static monitor::CountRecorder failed("meta_inodeid_alloc_failed");

    // NOTE(review): ids returned from this fast path are not compared against
    // InodeId::kNewChunkEngineMask; only the slow path below does that -- confirm this is intended.
    auto id = queue_.try_dequeue();
    if (LIKELY(id.has_value())) {
      if (queue_.size() < kAllocateBatch / 2) {
        tryStartAllocateTask(co_await folly::coro::co_current_executor);
      }
      co_return id.value();
    }

    auto result = co_await allocateSlow(timeout);
    if (result.hasError()) {
      failed.addSample(1);
      co_return result;
    }
    if (result->u64() >= InodeId::kNewChunkEngineMask) {
      failed.addSample(1);
      // Two arguments need two placeholders; the limit was previously missing from the message.
      XLOGF(DFATAL, "InodeId {} is larger than {}", *result, InodeId(InodeId::kNewChunkEngineMask));
      co_return makeError(MetaCode::kInodeIdAllocFailed, "InodeId too large, shouldn't happen");
    }
    co_return result;
  }

 private:
  static kv::FDBRetryStrategy createRetryStrategy() { return kv::FDBRetryStrategy({.retryMaybeCommitted = true}); }

  /**
   * Body of the background refill task. Optionally sleeps for `delay`, then runs allocateFromDB()
   * if the allocator is still alive; holding only a weak_ptr lets the allocator be destroyed
   * while a task is pending.
   */
  static CoTask<void> allocateTask(std::weak_ptr<InodeIdAllocator> weak,
                                   std::optional<folly::Duration> delay = std::nullopt) {
    if (delay.has_value()) {
      co_await folly::coro::sleep(delay.value());
    }
    auto ptr = weak.lock();
    if (ptr) {
      co_await ptr->allocateFromDB();
    }
    co_return;
  }

  /** Starts a refill task unless one is already marked as running (allocating_ acts as the flag). */
  void tryStartAllocateTask(folly::Executor *exec) {
    if (!allocating_.exchange(true)) {
      startAllocateTask(exec);
    }
  }

  /** Unconditionally schedules a refill task on `exec`. */
  void startAllocateTask(folly::Executor *exec) {
    // NOTE(review): presumably this guard keeps the caller's request context from leaking into the
    // background task -- confirm against folly::RequestContextScopeGuard.
    folly::RequestContextScopeGuard guard;
    allocateTask(weak_from_this()).scheduleOn(exec).start();
  }

  CoTryTask<InodeId> allocateSlow(std::chrono::microseconds timeout);
  CoTask<void> allocateFromDB();

  std::shared_ptr<kv::IKVEngine> engine_;
  IdAllocator<kv::FDBRetryStrategy> allocator_;
  std::atomic<bool> allocating_;  // true while a refill task is considered in flight
  folly::coro::BoundedQueue<InodeId, false, false> queue_;  // capacity: 2 * kAllocateBatch
};
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,334 @@
#include "meta/components/SessionManager.h"
#include <cassert>
#include <cstdint>
#include <fmt/format.h>
#include <folly/experimental/coro/BlockingWait.h>
#include <folly/functional/Partial.h>
#include <folly/logging/xlog.h>
#include <memory>
#include <optional>
#include <set>
#include <string_view>
#include <utility>
#include <vector>
#include "common/app/ApplicationBase.h"
#include "common/app/NodeId.h"
#include "common/kv/ITransaction.h"
#include "common/kv/WithTransaction.h"
#include "common/monitor/Recorder.h"
#include "common/serde/Serde.h"
#include "common/utils/BackgroundRunner.h"
#include "common/utils/Coroutine.h"
#include "common/utils/OptionalUtils.h"
#include "common/utils/Result.h"
#include "common/utils/UtcTime.h"
#include "common/utils/Uuid.h"
#include "fbs/core/user/User.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Service.h"
#include "fbs/meta/Utils.h"
#include "fdb/FDBRetryStrategy.h"
#include "meta/components/FileHelper.h"
#include "meta/store/FileSession.h"
namespace hf3fs::meta::server {
namespace {
monitor::CountRecorder pruned("meta_server.sessions_pruned");
monitor::CountRecorder pruneFailed("meta_server.sessions_prune_failed");
// Asks mgmtd for the current client sessions and returns the set of client ids considered active.
//
// - Fails if the listing fails, or if mgmtd is bootstrapping and `allowBootstrapping` is false.
// - Sessions whose client id cannot be parsed, or is the zero uuid, are skipped.
// - When `timeout` is set, sessions last extended more than `timeout + 10s` ago are skipped too.
CoTryTask<std::set<ClientId>> getActiveClients(client::ICommonMgmtdClient &mgmtd,
                                               bool allowBootstrapping,
                                               std::optional<Duration> timeout = std::nullopt) {
  auto listed = co_await mgmtd.listClientSessions();
  if (listed.hasError()) {
    XLOGF(ERR, "Failed to list active clients, error {}", listed.error());
    CO_RETURN_ERROR(listed);
  }
  if (!allowBootstrapping && listed->bootstrapping) {
    XLOGF(INFO, "Failed to list active clients, mgmtd is bootstrapping.");
    co_return makeError(MgmtdClientCode::kRoutingInfoNotReady);
  }

  std::set<ClientId> clients;
  for (const auto &session : listed->sessions) {
    auto uuid = Uuid::fromHexString(session.clientId);
    if (uuid.hasError()) {
      XLOGF(DFATAL, "Failed to parse client {} id {}, error {}", session.description, session.clientId, uuid.error());
    } else if (*uuid == Uuid::zero()) {
      XLOGF(DFATAL, "Client {} uuid {} is zero", session.description, session.clientId);
    } else if (timeout.has_value() && session.lastExtend + *timeout + 10_s < UtcClock::now()) {
      XLOGF(WARN, "Client {} timeout, last extended {}, ", session.description, session.lastExtend);
    } else {
      clients.emplace(*uuid);
    }
  }
  co_return clients;
}
} // namespace
/** SessionManager::ScanTask */
// Scans the file sessions of one shard and prunes the dead ones.
//
// A session is treated as dead when it is listed in prune_->sessions, or when its client is not
// in the active client set and the session is older than one minute relative to the scan start.
// Depending on sync_on_prune_session, dead sessions are either handed to the close workers or
// removed directly in a read-write transaction.
//
// Returns the number of sessions removed directly by this task; sessions handed to the close
// workers and the leftover prune batches at the end are not added to that count.
CoTryTask<size_t> SessionManager::ScanTask::run(SessionManager &manager) {
XLOGF(DBG, "ScanTask-{} start", shard_);
// NOTE(review): nothing in this function inserts into `map`, so the summary below never logs.
std::map<std::string, uint64_t> map;
SCOPE_EXIT {
for (const auto &[host, cnt] : map) {
XLOGF(INFO, "SessionManager found {} sessions for dead clients {}", cnt, host);
}
};
// get all active clients
// `ts` is taken before the client list is fetched; it is the reference for the 1-minute grace check.
auto ts = UtcClock::now();
auto active = co_await getActiveClients(*manager.mgmtd_, false, manager.config_.session_timeout());
CO_RETURN_ON_ERROR(active);
size_t total = 0;
// Last session of the previous page; passed back to FileSession::scan to continue from there.
std::optional<FileSession> prev;
while (true) {
// scan sessions
auto txn = manager.kvEngine_->createReadonlyTransaction();
auto sessions = co_await kv::WithTransaction(kv::FDBRetryStrategy{})
.run(std::move(txn), [&](auto &txn) -> CoTryTask<std::vector<FileSession>> {
co_return co_await FileSession::scan(txn, shard_, prev);
});
CO_RETURN_ON_ERROR(sessions);
if (sessions->empty()) {
break;
}
// filter dead sessions
std::vector<FileSession> deadSessions;
for (auto &session : *sessions) {
if (prune_->sessions.rlock()->contains(session.sessionId)) {
// need prune this session
XLOGF(INFO, "Need prune session {}", session);
// Remove it from the shared prune map so the final leftover pass does not handle it again.
prune_->sessions.wlock()->erase(session.sessionId);
} else {
// check client is active or not
if (active->contains(session.clientId)) {
continue;
}
if (session.timestamp + 1_min > ts) {
// concurrent create session and scan
auto now = UtcClock::now();
XLOGF_IF(WARN, session.timestamp > now + 5_s, "Session timestamp {} > now {}", session.timestamp, now);
continue;
}
XLOGF(WARN, "SessionManager found dead session {}", session);
}
deadSessions.push_back(session);
}
prev = sessions->back();
// prune dead sessions
if (manager.config_.sync_on_prune_session()) {
// Hand each dead session to the close workers; see CloseTask::run.
for (auto &session : deadSessions) {
co_await manager.closeWorkers_->enqueue(std::make_unique<CloseTask>(session));
}
} else {
// Remove all dead sessions of this page in a single read-write transaction.
auto txn = manager.kvEngine_->createReadWriteTransaction();
auto result =
co_await kv::WithTransaction(kv::FDBRetryStrategy{}).run(std::move(txn), [&](auto &txn) -> CoTryTask<Void> {
for (auto &session : deadSessions) {
CO_RETURN_ON_ERROR(co_await session.remove(txn));
}
co_return Void{};
});
if (result.hasError()) {
pruneFailed.addSample(deadSessions.size());
XLOGF(ERR, "ScanTask-{} prune failed, error {}", shard_, result.error());
CO_RETURN_ERROR(result);
}
total += deadSessions.size();
pruned.addSample(deadSessions.size());
}
}
// Count this shard as finished; only the task that completes the last shard continues below.
auto finished = prune_->finished.fetch_add(1) + 1;
if (finished < FileSession::kShard) {
co_return total;
}
// Remove whatever is still left in the prune map, kBatch sessions per transaction.
while (!prune_->sessions.rlock()->empty()) {
static constexpr size_t kBatch = 64;
std::vector<FileSession> batch;
batch.reserve(kBatch);
auto wlock = prune_->sessions.wlock();
auto iter = wlock->begin();
while (iter != wlock->end() && batch.size() < kBatch) {
XLOGF(INFO, "Need prune session {}", iter->second);
batch.push_back(iter->second);
iter = wlock->erase(iter);
}
// Release the lock before suspending on the transaction below.
wlock.unlock();
auto txn = manager.kvEngine_->createReadWriteTransaction();
auto prune = co_await kv::WithTransaction(kv::FDBRetryStrategy{})
.run(std::move(txn), [&](IReadWriteTransaction &txn) -> CoTryTask<Void> {
for (auto &session : batch) {
CO_RETURN_ON_ERROR(co_await session.remove(txn));
}
co_return Void{};
});
// A failed batch is only logged and counted; its sessions were already erased from the map.
if (prune.hasError()) {
pruneFailed.addSample(batch.size());
XLOGF(WARN, "Prune session failed, error {}", prune.error());
}
}
co_return total;
}
/** SessionManager::CloseTask */
// Closes the file behind a dead session through the registered close function.
//
// If the close succeeds the task is done. Otherwise the session record is removed directly in a
// read-write transaction and the result of that removal is returned. A kNotFound close error is
// counted as pruned; any other close error is logged and counted as a failure.
CoTryTask<void> SessionManager::CloseTask::run(SessionManager &manager) {
  XLOGF_IF(FATAL, !manager.close_, "close_ not set");

  auto req = CloseReq({}, session_.inodeId, SessionInfo(session_.clientId, session_.sessionId), true, {}, {});
  req.client = session_.clientId;
  req.pruneSession = true;

  auto closed = co_await manager.close_(req);
  if (!closed.hasError()) {
    pruned.addSample(1);
    co_return Void{};
  }

  if (closed.error().code() == MetaCode::kNotFound) {
    pruned.addSample(1);
  } else {
    XLOGF(ERR, "SessionManager failed to close {}, error {}", req.inode, closed.error());
    pruneFailed.addSample(1);
  }

  // Close did not succeed: drop the session record ourselves.
  auto txn = manager.kvEngine_->createReadWriteTransaction();
  auto removed = co_await kv::WithTransaction(kv::FDBRetryStrategy{})
                     .run(std::move(txn), folly::partial(&FileSession::remove, &session_));
  if (removed.hasError()) {
    pruneFailed.addSample(1);
    XLOGF(WARN, "Prune session {} failed, error {}", session_, removed.error());
  }
  co_return removed;
}
/** SessionManager */
// Creates and starts the close worker pool, the scan worker pool and the periodic scan runner,
// in that order. Both pools execute their tasks by calling task->run(*this).
void SessionManager::start(CPUExecutorGroup &exec) {
XLOGF(DBG, "SessionManager start");
closeWorkers_ = std::make_unique<CoroutinesPool<std::unique_ptr<CloseTask>>>(config_.close_workers());
closeWorkers_->start([&](auto task) -> CoTask<void> { co_await task->run(*this); }, exec);
scanWorkers_ = std::make_unique<CoroutinesPool<std::unique_ptr<ScanTask>>>(config_.scan_workers());
scanWorkers_->start([&](auto task) -> CoTask<void> { co_await task->run(*this); }, exec);
// The runner invokes scanTask() at the interval supplied by scan_interval_getter().
scanRunner_ = std::make_unique<BackgroundRunner>(&exec.pickNext());
scanRunner_->start("SessionScan", folly::partial(&SessionManager::scanTask, this), config_.scan_interval_getter());
XLOGF(INFO, "SessionManager started!");
}
// Stops the scan runner, the scan workers and the close workers, in that order (the producer of
// each queue is stopped before its consumer). Each component is reset after stopping, so calling
// this again -- e.g. from the destructor -- is a no-op.
void SessionManager::stopAndJoin() {
XLOGF(DBG, "SessionManager stop.");
if (scanRunner_) {
// Blocks the calling thread until the runner has stopped.
folly::coro::blockingWait(scanRunner_->stopAll());
scanRunner_.reset();
}
if (scanWorkers_) {
scanWorkers_->stopAndJoin();
scanWorkers_.reset();
}
if (closeWorkers_) {
closeWorkers_->stopAndJoin();
closeWorkers_.reset();
}
XLOGF(INFO, "SessionManager stopped.");
}
// Periodic entry point of the session scan.
// Does nothing when scanning is disabled or when isFirstMeta() is false for this node; otherwise
// loads the sessions marked for pruning and enqueues one ScanTask per session shard.
CoTask<void> SessionManager::scanTask() {
  if (!config_.enable()) {
    XLOGF_EVERY_MS(INFO, 10000, "SessionManager scan disabled");
    co_return;
  }
  if (!isFirstMeta(*mgmtd_, nodeId_)) {
    co_return;
  }
  XLOGF(INFO, "MetaServer {} is first active meta, scan sessions", nodeId_);

  auto prune = co_await loadPrune();
  if (!prune) {
    XLOGF(ERR, "Failed to load sessions need to be pruned");
    co_return;
  }

  // All shard tasks share the same PruneSessions instance.
  size_t shard = 0;
  while (shard < FileSession::kShard) {
    co_await scanWorkers_->enqueue(std::make_unique<ScanTask>(shard, *prune));
    shard++;
  }
  co_return;
}
// Loads the sessions marked for pruning (at most 128k of them) and returns them in a fresh
// PruneSessions object, keyed by session id.
CoTryTask<std::shared_ptr<SessionManager::PruneSessions>> SessionManager::loadPrune() {
  auto txn = kvEngine_->createReadonlyTransaction();
  auto listed = co_await kv::WithTransaction(kv::FDBRetryStrategy{})
                    .run(std::move(txn), [&](auto &txn) -> CoTryTask<std::vector<FileSession>> {
                      co_return co_await FileSession::listPrune(txn, 128 << 10 /* at most 128k sessions to prune */);
                    });
  CO_RETURN_ON_ERROR(listed);
  XLOGF_IF(INFO, !listed->empty(), "SessionManager found {} sessions to prune", listed->size());

  auto prune = std::make_shared<PruneSessions>();
  {
    auto locked = prune->sessions.wlock();
    for (const auto &session : *listed) {
      locked->emplace(session.sessionId, session);
    }
  }
  co_return prune;
}
// Returns every file session of every shard. Each shard is read page by page, one read-only
// transaction per page, continuing after the last session of the previous page.
CoTryTask<std::vector<FileSession>> SessionManager::listSessions() {
  // todo: should we add this?
  std::vector<FileSession> all;
  for (size_t shard = 0; shard < FileSession::kShard; shard++) {
    std::optional<FileSession> prev;
    for (;;) {
      auto txn = kvEngine_->createReadonlyTransaction();
      auto page = co_await kv::WithTransaction(kv::FDBRetryStrategy{})
                      .run(std::move(txn), [&](auto &txn) -> CoTryTask<std::vector<FileSession>> {
                        co_return co_await FileSession::scan(txn, shard, prev);
                      });
      CO_RETURN_ON_ERROR(page);
      if (page->empty()) {
        break;
      }
      prev = page->back();
      all.insert(all.end(), page->begin(), page->end());
    }
  }
  co_return all;
}
// Returns the file sessions of a single inode, read in one read-only transaction.
CoTryTask<std::vector<FileSession>> SessionManager::listSessions(InodeId inodeId) {
  auto listForInode = [&](IReadOnlyTransaction &txn) { return FileSession::list(txn, inodeId, true); };
  auto readTxn = kvEngine_->createReadonlyTransaction();
  co_return co_await kv::WithTransaction<kv::FDBRetryStrategy>({}).run(std::move(readTxn), listForInode);
}
// Runs a full prune pass inline: loads the sessions marked for pruning, then runs a ScanTask for
// every shard one after another. Returns the sum of the per-shard counts, or the first error.
CoTryTask<size_t> SessionManager::pruneManually() {
  XLOGF(INFO, "SessionManager pruneManually");
  auto prune = co_await loadPrune();
  CO_RETURN_ON_ERROR(prune);

  size_t prunedTotal = 0;
  for (size_t shard = 0; shard < FileSession::kShard; shard++) {
    ScanTask shardTask(shard, *prune);
    auto shardResult = co_await shardTask.run(*this);
    CO_RETURN_ON_ERROR(shardResult);
    prunedTotal += *shardResult;
  }
  co_return prunedTotal;
}
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,145 @@
#pragma once
#include <cassert>
#include <fmt/core.h>
#include <folly/Executor.h>
#include <folly/Synchronized.h>
#include <folly/functional/Invoke.h>
#include <folly/logging/xlog.h>
#include <functional>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <tuple>
#include <utility>
#include <vector>
#include "client/mgmtd/IMgmtdClientForServer.h"
#include "common/app/ClientId.h"
#include "common/app/NodeId.h"
#include "common/kv/IKVEngine.h"
#include "common/kv/ITransaction.h"
#include "common/kv/KeyPrefix.h"
#include "common/serde/Serde.h"
#include "common/utils/BackgroundRunner.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/Coroutine.h"
#include "common/utils/CoroutinesPool.h"
#include "common/utils/Duration.h"
#include "common/utils/Result.h"
#include "common/utils/String.h"
#include "common/utils/UtcTime.h"
#include "common/utils/Uuid.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Service.h"
#include "meta/store/FileSession.h"
#include "meta/store/Inode.h"
#include "meta/store/Utils.h"
namespace hf3fs::meta::server {
class FileHelper;
// Manages file sessions on the meta server: background scanning/pruning (start/stopAndJoin) plus
// the listing and manual-prune entry points used by admin tooling.
// The Config passed to the constructor is stored by reference (config_), so it must outlive this
// object. The destructor calls stopAndJoin().
class SessionManager {
public:
// Hot-updatable settings. scan_workers / close_workers configure the two coroutine pools
// (defaults: 8 coroutines / queue 128, and 32 coroutines / queue 1024).
class Config : public ConfigBase<Config> {
CONFIG_HOT_UPDATED_ITEM(enable, true);
CONFIG_HOT_UPDATED_ITEM(scan_interval, 5_min);
CONFIG_HOT_UPDATED_ITEM(scan_batch, 1024u);
CONFIG_HOT_UPDATED_ITEM(sync_on_prune_session, false);
CONFIG_HOT_UPDATED_ITEM(session_timeout, 5_min);
CONFIG_OBJ(scan_workers, CoroutinesPoolBase::Config, [](auto &c) {
c.set_coroutines_num(8);
c.set_queue_size(128);
});
CONFIG_OBJ(close_workers, CoroutinesPoolBase::Config, [](auto &c) {
c.set_coroutines_num(32);
c.set_queue_size(1024);
});
};
// Stores the collaborators; no background work is started here (see start()).
SessionManager(const Config &cfg,
flat::NodeId nodeId,
std::shared_ptr<kv::IKVEngine> kvEngine,
std::shared_ptr<client::ICommonMgmtdClient> mgmtd,
std::shared_ptr<FileHelper> fileHelper)
: config_(cfg),
nodeId_(nodeId),
kvEngine_(kvEngine),
mgmtd_(mgmtd),
fileHelper_(fileHelper) {}
~SessionManager() { stopAndJoin(); }
void start(CPUExecutorGroup &exec);
void stopAndJoin();
// Callback used to close a session; installed by the owner via setCloseFunc().
using CloseFunc = std::function<CoTryTask<void>(const meta::CloseReq &)>;
void setCloseFunc(CloseFunc close) { close_ = close; }
// for admin_cli
CoTryTask<std::vector<FileSession>> listSessions();
CoTryTask<std::vector<FileSession>> listSessions(InodeId inodeId);
// Runs a ScanTask over every session shard and returns the summed per-shard result.
CoTryTask<size_t> pruneManually();
private:
// State shared by the ScanTasks of one prune pass.
struct PruneSessions {
std::atomic<size_t> finished = 0;
folly::Synchronized<std::map<Uuid, FileSession>> sessions;
};
// Unit of work for one session shard.
class ScanTask {
public:
ScanTask(size_t shard, std::shared_ptr<PruneSessions> prune)
: shard_(shard),
prune_(prune) {}
CoTryTask<size_t> run(SessionManager &manager);
private:
// Default is size_t(-1); the only constructor always overwrites it.
size_t shard_ = -1;
std::shared_ptr<PruneSessions> prune_;
};
// try to close and sync
class CloseTask {
public:
CloseTask(FileSession session)
: session_(std::move(session)) {}
CoTryTask<void> run(SessionManager &manager);
private:
FileSession session_;
};
CoTask<void> scanTask();
CoTryTask<std::shared_ptr<PruneSessions>> loadPrune();
// Reference to caller-owned configuration; not copied.
const Config &config_;
flat::NodeId nodeId_;
std::shared_ptr<kv::IKVEngine> kvEngine_;
std::shared_ptr<client::ICommonMgmtdClient> mgmtd_;
std::shared_ptr<FileHelper> fileHelper_;
std::unique_ptr<BackgroundRunner> scanRunner_;
std::unique_ptr<CoroutinesPool<std::unique_ptr<ScanTask>>> scanWorkers_;
std::unique_ptr<CoroutinesPool<std::unique_ptr<CloseTask>>> closeWorkers_;
CloseFunc close_;
};
} // namespace hf3fs::meta::server
FMT_BEGIN_NAMESPACE
// fmt support for FileSession. Output shape: "{inodeId <id>, client <id>, session <id>}".
// Format-spec parsing is inherited from formatter<std::string_view>; the spec itself is not
// applied to the output.
template <>
struct formatter<hf3fs::meta::server::FileSession> : formatter<std::string_view> {
  template <typename FormatContext>
  auto format(const hf3fs::meta::server::FileSession &session, FormatContext &ctx) const {
    // Call fmt::format_to explicitly. An unqualified format_to is subject to argument-dependent
    // lookup and can become ambiguous with std::format_to when <format> is visible.
    return fmt::format_to(ctx.out(),
                          "{{inodeId {}, client {}, session {}}}",
                          session.inodeId,
                          session.clientId,
                          session.sessionId);
  }
};
FMT_END_NAMESPACE

71
src/meta/event/Event.cc Normal file
View File

@@ -0,0 +1,71 @@
#include "meta/event/Event.h"
#include <cassert>
#include <folly/json.h>
#include <folly/logging/Logger.h>
#include <folly/logging/LoggerDB.h>
#include <folly/logging/xlog.h>
#include "common/utils/MagicEnum.hpp"
#include "common/utils/Result.h"
namespace hf3fs::meta::server {
// File-local folly loggers, one per Event::Type, each under its own "eventlog.<Type>" category,
// plus "eventlog.Unknown" as the fallback returned by getLogger() for unrecognized types.
namespace {
folly::Logger create("eventlog.Create");
folly::Logger mkdir("eventlog.Mkdir");
folly::Logger hardLink("eventlog.HardLink");
folly::Logger remove("eventlog.Remove");
folly::Logger truncate("eventlog.Truncate");
folly::Logger openWrite("eventlog.OpenWrite");
folly::Logger closeWrite("eventlog.CloseWrite");
folly::Logger rename("eventlog.Rename");
folly::Logger symlink("eventlog.Symlink");
folly::Logger gc("eventlog.GC");
folly::Logger unknown("eventlog.Unknown");
} // namespace
// Maps an event type to its dedicated file-local logger. A value outside the enumerators is
// reported with DFATAL and routed to the `unknown` logger.
static folly::Logger &getLogger(Event::Type type) {
  folly::Logger *selected = nullptr;
  // No default label on purpose, so the compiler can still flag a missing enumerator.
  switch (type) {
    case Event::Type::Create:
      selected = &create;
      break;
    case Event::Type::Mkdir:
      selected = &mkdir;
      break;
    case Event::Type::HardLink:
      selected = &hardLink;
      break;
    case Event::Type::Remove:
      selected = &remove;
      break;
    case Event::Type::Truncate:
      selected = &truncate;
      break;
    case Event::Type::OpenWrite:
      selected = &openWrite;
      break;
    case Event::Type::CloseWrite:
      selected = &closeWrite;
      break;
    case Event::Type::Rename:
      selected = &rename;
      break;
    case Event::Type::Symlink:
      selected = &symlink;
      break;
    case Event::Type::GC:
      selected = &gc;
      break;
  }
  if (selected != nullptr) {
    return *selected;
  }
  XLOGF(DFATAL, "Unknown type {}", (int)type);
  return unknown;
}
void Event::log() const {
folly::json::serialization_opts opts;
opts.pretty_formatting = false;
opts.sort_keys = false;
return log(opts);
}
// Serializes `data` to JSON with the given options and writes it at INFO level to the logger that
// getLogger() selects for this event's type. A JSON print error is reported through XLOGF and is
// not propagated to the caller.
void Event::log(const folly::json::serialization_opts &opts) const {
  try {
    auto msg = folly::json::serialize(data, opts);
    // getLogger() returns a reference to a file-level logger; bind to it instead of copying.
    auto &logger = getLogger(type);
    FB_LOG(logger, INFO, msg);
  } catch (const folly::json::print_error &exception) {
    XLOGF(ERR, "Event failed to serialize to json, type {}, error {}", magic_enum::enum_name(type), exception.what());
  }
}
} // namespace hf3fs::meta::server

74
src/meta/event/Event.h Normal file
View File

@@ -0,0 +1,74 @@
#pragma once
#include <algorithm>
#include <cstdint>
#include <folly/Utility.h>
#include <folly/dynamic.h>
#include <folly/hash/Checksum.h>
#include <folly/json.h>
#include <folly/logging/Logger.h>
#include <folly/logging/xlog.h>
#include <limits>
#include <string_view>
#include <utility>
#include <vector>
#include "common/serde/Serde.h"
#include "common/utils/MagicEnum.hpp"
#include "common/utils/Path.h"
#include "common/utils/UtcTime.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Schema.h"
#include "fbs/mgmtd/MgmtdTypes.h"
namespace hf3fs::meta::server {
// A typed event whose payload is a folly::dynamic; log() serializes the payload to JSON and
// writes it to the logger for `type`.
struct Event {
enum class Type { Create, Mkdir, HardLink, Remove, Truncate, OpenWrite, CloseWrite, Rename, Symlink, GC };
Type type;
folly::dynamic data;
// Starts from an empty object and pre-populates "event" (the enumerator name) and "ts"
// (UtcClock::now().toMicroseconds()).
Event(Type type)
: Event(type, folly::dynamic::object()) {
addField("event", magic_enum::enum_name(type));
addField("ts", UtcClock::now().toMicroseconds());
}
// Uses `data` as given; no fields are added.
Event(Type type, folly::dynamic data)
: type(type),
data(std::move(data)) {}
// Logs with pretty_formatting and sort_keys disabled.
void log() const;
void log(const folly::json::serialization_opts &opts) const;
// Inserts key/value into `data` and returns *this so calls can be chained.
Event &addField(folly::dynamic key, folly::dynamic value) {
data.insert(std::move(key), std::move(value));
return *this;
}
};
// Flat trace record for a meta event. Every field is declared through SERDE_STRUCT_FIELD with the
// default value shown.
// NOTE(review): field order presumably matters to the serde layout -- confirm before reordering.
struct MetaEventTrace {
SERDE_STRUCT_FIELD(eventType, Event::Type::Create);
SERDE_STRUCT_FIELD(inodeId, InodeId());
SERDE_STRUCT_FIELD(parentId, InodeId());
SERDE_STRUCT_FIELD(entryName, std::string());
SERDE_STRUCT_FIELD(dstParentId, InodeId());
SERDE_STRUCT_FIELD(dstEntryName, std::string());
SERDE_STRUCT_FIELD(ownerId, Uid(0));
SERDE_STRUCT_FIELD(userId, Uid());
SERDE_STRUCT_FIELD(client, ClientId{Uuid::zero()});
SERDE_STRUCT_FIELD(tableId, flat::ChainTableId());
SERDE_STRUCT_FIELD(inodeType, InodeType::File);
SERDE_STRUCT_FIELD(nlink, uint16_t(0));
SERDE_STRUCT_FIELD(length, uint64_t(0));
SERDE_STRUCT_FIELD(truncateVer, uint64_t(0));
SERDE_STRUCT_FIELD(dynStripe, uint32_t(0));
SERDE_STRUCT_FIELD(oflags, OpenFlags());
SERDE_STRUCT_FIELD(recursiveRemove, false);
SERDE_STRUCT_FIELD(removedChunks, size_t(0));
SERDE_STRUCT_FIELD(pruneSession, false);
SERDE_STRUCT_FIELD(symLinkTarget, Path());
SERDE_STRUCT_FIELD(origPath, Path());
};
} // namespace hf3fs::meta::server

285
src/meta/event/Scan.cc Normal file
View File

@@ -0,0 +1,285 @@
#include "meta/event/Scan.h"
#include <algorithm>
#include <chrono>
#include <fmt/format.h>
#include <folly/Likely.h>
#include <folly/Synchronized.h>
#include <folly/experimental/coro/AsyncGenerator.h>
#include <folly/experimental/coro/Collect.h>
#include <folly/experimental/coro/CurrentExecutor.h>
#include <folly/experimental/coro/Invoke.h>
#include <folly/futures/Future.h>
#include <folly/init/Init.h>
#include <folly/logging/xlog.h>
#include <iterator>
#include <memory>
#include <mutex>
#include <stdexcept>
#include <string>
#include <thread>
#include <utility>
#include <vector>
#include "common/kv/ITransaction.h"
#include "common/kv/KeyPrefix.h"
#include "common/logging/LogInit.h"
#include "common/utils/Coroutine.h"
#include "common/utils/ExponentialBackoffRetry.h"
#include "common/utils/Result.h"
#include "fdb/FDB.h"
#include "fdb/FDBKVEngine.h"
#include "meta/store/DirEntry.h"
#include "meta/store/Inode.h"
namespace hf3fs::meta::server {
// Builds the retry helper from the backoff_* options. Each option is multiplied by 1000 and
// truncated to a whole number of milliseconds.
static ExponentialBackoffRetry createBackoff(const MetaScan::Options &options) {
  const auto toMillis = [](double value) { return std::chrono::milliseconds(static_cast<int>(value * 1000)); };
  return ExponentialBackoffRetry(toMillis(options.backoff_min_wait),
                                 toMillis(options.backoff_max_wait),
                                 toMillis(options.backoff_total_wait));
}
// Reads the next page of [begin, end) with a snapshot range read. On success the cursor advances:
// `hasMore` is copied from the result and `begin` moves to the key after the last returned one.
// On error the range is left unchanged and the error is returned.
CoTryTask<kv::IReadOnlyTransaction::GetRangeResult> MetaScan::KeyRange::snapshotGetRange(kv::IReadOnlyTransaction &txn,
                                                                                         int32_t limit) {
  XLOGF(DBG, "MetaScan snapshotGetRange {}", describe());
  auto page = co_await txn.snapshotGetRange({begin, true}, {end, false}, limit);
  CO_RETURN_ON_ERROR(page);

  hasMore = page->hasMore;
  const bool gotRows = !page->kvs.empty();
  if (gotRows) {
    begin = kv::TransactionHelper::keyAfter(page->kvs.rbegin()->key);
  }
  co_return page;
}
// Splits the key space under `prefix` into 256 consecutive ranges, one per value of the byte that
// follows the prefix. Range i is [prefix+i, prefix+(i+1)); the last range ends at
// prefixListEndKey(prefix).
std::vector<MetaScan::KeyRange> MetaScan::KeyRange::split(std::string prefix) {
  std::vector<KeyRange> ranges;
  for (int byte = 0; byte <= 0xff; ++byte) {
    std::string lower = prefix + static_cast<char>(byte);
    std::string upper;
    if (byte == 0xff) {
      upper = kv::TransactionHelper::prefixListEndKey(prefix);
    } else {
      upper = prefix + static_cast<char>(byte + 1);
    }
    ranges.push_back({lower, upper});
  }
  return ranges;
}
// Throws std::runtime_error when `threads` or `coroutines` is negative; otherwise returns the
// argument unchanged so the check can run inside a member initializer.
static const MetaScan::Options &validatedOptions(const MetaScan::Options &options) {
  if (options.threads < 0 || options.coroutines < 0) {
    throw std::runtime_error("Invalid options, thread < 0 or coroutines < 0");
  }
  return options;
}

// Creates a scanner.
//   options  - scan tuning, logging setup and (optionally) the FDB cluster file.
//   kvEngine - engine to scan; when empty, a new FDB client is created from
//              options.fdb_cluster_file (see createKVEngine()).
// Throws std::runtime_error on negative threads/coroutines, or when neither a kvEngine nor a
// cluster file is given.
MetaScan::MetaScan(Options options, std::shared_ptr<kv::IKVEngine> kvEngine)
    // options_ is declared before exec_, so validating here guarantees a negative thread count is
    // rejected before it is converted to size_t and used to size the executor below.
    : options_(validatedOptions(options)),
      kvEngine_(kvEngine),
      exec_(std::pair<size_t, size_t>{options.threads, options.threads},
            std::make_shared<folly::NamedThreadFactory>("Scan")) {
  if (!kvEngine && options_.fdb_cluster_file.empty()) {
    throw std::runtime_error("Should set kvEngine or fdb cluster file");
  }
  if (!options_.logging.empty()) {
    XLOGF(INFO, "Setup log: {}", options_.logging);
    logging::initOrDie(options_.logging);
  }
  createKVEngine();
}
// Shutdown order: request cancellation of any background scan that was started, stop the
// executor, then -- only if this object started the FDB network thread -- stop the network and
// join that thread.
MetaScan::~MetaScan() {
if (scanInodeTask_.has_value()) {
scanInodeTask_->cancel.requestCancellation();
}
if (scanDirEntryTask_.has_value()) {
scanDirEntryTask_->cancel.requestCancellation();
}
exec_.stop();
if (fdbNetwork_) {
kv::fdb::DB::stopNetwork();
fdbNetwork_->join();
}
}
// Creates a read-only FDB-backed engine from options_.fdb_cluster_file when the caller did not
// supply one. Selects the API version, sets up the FDB network, runs it on a dedicated thread
// (fdbNetwork_), then opens the database.
// Throws std::runtime_error if the network setup reports an error.
void MetaScan::createKVEngine() {
// An engine was injected through the constructor; nothing to do.
if (kvEngine_) {
return;
}
kv::fdb::DB::selectAPIVersion(FDB_API_VERSION);
auto error = kv::fdb::DB::setupNetwork();
if (error) {
throw std::runtime_error(fmt::format("Failed to setup fdb network, error {}", kv::fdb::DB::errorMsg(error)));
}
// The destructor stops the network and joins this thread.
fdbNetwork_ = std::jthread([&]() { kv::fdb::DB::runNetwork(); });
kvEngine_ = std::make_shared<kv::FDBKVEngine>(kv::fdb::DB(options_.fdb_cluster_file, true /* readonly */));
}
// Returns the next batch of scanned inodes, starting the background inode scan on first use
// (queue capacity 256). Calls are serialized by mutex_, which is held while waiting.
std::vector<Inode> MetaScan::getInodes() {
  std::scoped_lock<std::mutex> guard(mutex_);
  if (!scanInodeTask_.has_value()) {
    auto &background = scanInodeTask_.emplace(256);
    background.future = scanInode(background).scheduleOn(&exec_).start();
  }
  return waitResult(scanInodeTask_);
}
// Returns the next batch of scanned directory entries, starting the background dentry scan on
// first use (queue capacity 256). Calls are serialized by mutex_, which is held while waiting.
std::vector<DirEntry> MetaScan::getDirEntries() {
  std::scoped_lock<std::mutex> guard(mutex_);
  if (!scanDirEntryTask_.has_value()) {
    auto &background = scanDirEntryTask_.emplace(256);
    background.future = scanDirEntry(background).scheduleOn(&exec_).start();
  }
  return waitResult(scanDirEntryTask_);
}
// Blocks the calling thread until a batch of scanned items is available and returns it.
//   - Returns as soon as more than 64 items have been collected.
//   - Once the background scan has finished, returns whatever is left; an empty vector means the
//     scan is complete and fully consumed.
//   - Throws std::runtime_error with the scan's error description if the scan finished with an
//     error and there is nothing left to return.
// Polls every 10ms while neither condition holds.
template <typename T>
std::vector<T> MetaScan::waitResult(std::optional<BackgroundTask<T>> &task) {
  std::vector<T> vec;
  while (true) {
    // Sample completion BEFORE draining the queue. The scan coroutine only completes after its
    // last enqueue, so if it was already finished here, the drain below sees every remaining
    // batch. Checking afterwards could miss batches enqueued between the drain and the check and
    // report end-of-scan (empty vector) while items are still queued.
    const bool finished = task->future.isReady();
    // dequeue
    while (true) {
      auto result = task->queue.try_dequeue();
      if (!result.has_value()) {
        break;
      }
      if (vec.empty()) {
        vec = std::move(*result);
      } else {
        vec.insert(vec.end(), std::make_move_iterator(result->begin()), std::make_move_iterator(result->end()));
      }
    }
    // return items
    if (vec.size() > 64) {
      return vec;
    }
    if (finished) {
      if (!vec.empty()) {
        return vec;
      }
      if (task->future.valid()) {
        auto &result = task->future.result();
        if (result.value().hasError()) {
          throw std::runtime_error(result.value().error().describe());
        }
      }
      return {};
    }
    std::this_thread::sleep_for(10_ms);
  }
}
// Scans one key range to completion. Each page is decoded with T::newUnpacked and pushed to
// task.queue as one batch; enqueueing honors task.cancel. Read errors are retried until the
// backoff helper reports a zero wait time, at which point the error is returned. The transaction
// is reset when it reports kTooOld and proactively once it is older than 3 seconds.
template <typename T>
CoTryTask<void> MetaScan::scanRange(KeyRange range, BackgroundTask<T> &task) {
  auto originRange = range;
  XLOGF(INFO, "Worker scan range {}", originRange.describe());
  size_t total = 0;
  auto txn = kvEngine_->createReadonlyTransaction();
  auto txnCreateTime = RelativeTime::now();
  while (range.hasMore) {
    // A fresh backoff budget for every page.
    auto retry = createBackoff(options_);
    while (true) {
      // todo: tune FDB transaction get range mode
      auto result = co_await range.snapshotGetRange(*txn, options_.items_per_getrange);
      // handle error
      if (result.hasError()) {
        auto wait = retry.getWaitTime();
        if (wait.count() == 0) {
          XLOGF(ERR, "Failed to get range {} after retry {}ms", range.describe(), retry.getElapsedTime().count());
          CO_RETURN_ERROR(result);
        }
        // NOTE(review): `wait` is only compared against zero; nothing here sleeps for it before
        // retrying. Confirm whether getWaitTime() blocks internally -- otherwise failed reads are
        // retried back-to-back until the total backoff budget is exhausted.
        if (result.error().code() == TransactionCode::kTooOld) {
          txnCreateTime = RelativeTime::now();
          txn->reset();
        }
        continue;
      }
      // deserialize and continue
      std::vector<T> items;
      items.reserve(result->kvs.size());
      // Bind by const reference so each key/value is not copied just to be decoded.
      for (const auto &[key, value] : result->kvs) {
        auto unpacked = T::newUnpacked(key, value);
        if (unpacked.hasError()) {
          // todo: maybe should return error here, or give caller a statistic data?
          XLOGF(FATAL, "Failed to deserialize key {:02x}, value {:02x}", fmt::join(key, ""), fmt::join(value, ""));
        } else {
          items.emplace_back(std::move(*unpacked));
        }
      }
      if (!items.empty()) {
        total += items.size();
        co_await folly::coro::co_withCancellation(task.cancel.getToken(), task.queue.enqueue(std::move(items)));
      }
      // reset transaction to avoid transaction too old
      if (RelativeTime::now() - txnCreateTime > std::chrono::seconds(3)) {
        txnCreateTime = RelativeTime::now();
        txn->reset();
      }
      break;
    }
  }
  XLOGF(INFO, "Worker finished scan range {}, found {} kvs", originRange.describe(), total);
  co_return Void{};
}
// Scans every key under `prefix`. The prefix is split into sub-ranges that are placed on a shared
// work list; options_.coroutines worker coroutines, started on the current executor, repeatedly
// take a range and run scanRange() on it. Returns the first error among the workers' results,
// otherwise success.
template <typename T>
CoTryTask<void> MetaScan::scan(kv::KeyPrefix prefix, BackgroundTask<T> &task) {
  folly::Synchronized<std::queue<KeyRange>, std::mutex> pending;
  {
    auto ranges = KeyRange::split(std::string(kv::toStr(prefix)));
    auto locked = pending.lock();
    for (auto &range : ranges) {
      locked->push(range);
    }
  }

  auto exec = co_await folly::coro::co_current_executor;
  std::vector<folly::SemiFuture<Result<Void>>> workers;
  for (int idx = 0; idx < options_.coroutines; ++idx) {
    // Captures by reference are safe: this coroutine awaits every worker below before returning.
    auto worker = folly::coro::co_invoke([&]() -> CoTryTask<void> {
      for (;;) {
        KeyRange next;
        {
          auto locked = pending.lock();
          if (locked->empty()) {
            break;
          }
          next = locked->front();
          locked->pop();
        }
        auto scanned = co_await scanRange(next, task);
        CO_RETURN_ON_ERROR(scanned);
      }
      co_return Void{};
    });
    workers.push_back(std::move(worker).scheduleOn(exec).start());
  }

  auto results = co_await folly::coro::collectAllRange(std::move(workers));
  for (auto &workerResult : results) {
    CO_RETURN_ON_ERROR(workerResult);
  }
  co_return Void{};
}
// Background scan over all keys with the Inode key prefix; decoded items go to task.queue.
CoTryTask<void> MetaScan::scanInode(BackgroundTask<Inode> &task) {
co_return co_await scan(kv::KeyPrefix::Inode, task);
}
// Background scan over all keys with the Dentry key prefix; decoded items go to task.queue.
CoTryTask<void> MetaScan::scanDirEntry(BackgroundTask<DirEntry> &task) {
co_return co_await scan(kv::KeyPrefix::Dentry, task);
}
} // namespace hf3fs::meta::server

115
src/meta/event/Scan.h Normal file
View File

@@ -0,0 +1,115 @@
#pragma once
#include <algorithm>
#include <cmath>
#include <exception>
#include <fmt/core.h>
#include <folly/CancellationToken.h>
#include <folly/Function.h>
#include <folly/MPMCQueue.h>
#include <folly/concurrency/UnboundedQueue.h>
#include <folly/executors/CPUThreadPoolExecutor.h>
#include <folly/experimental/coro/BoundedQueue.h>
#include <folly/futures/Future.h>
#include <memory>
#include <optional>
#include <stdexcept>
#include <vector>
#include "common/kv/IKVEngine.h"
#include "common/kv/ITransaction.h"
#include "common/kv/KeyPrefix.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Duration.h"
#include "common/utils/Result.h"
#include "meta/store/DirEntry.h"
#include "meta/store/Inode.h"
namespace hf3fs::meta::server {
// Bulk scanner over the Inode and Dentry key spaces of the KV store. Scans run in the background
// on an internal thread pool; getInodes()/getDirEntries() block and hand out results in batches.
class MetaScan {
public:
struct Options {
// scan options
// Thread count of the internal executor; must not be negative.
int threads = 4;
// Number of worker coroutines started per scan; must not be negative.
int coroutines = 8;
// Passed as the `limit` argument of each snapshot range read.
int items_per_getrange = -1;
// Backoff settings in seconds (converted to milliseconds when the retry helper is built).
double backoff_min_wait = 0.1; // 100ms
double backoff_max_wait = 5; // 5s
double backoff_total_wait = 60; // 60s
// log level
// When non-empty, handed to logging::initOrDie() by the constructor.
std::string logging;
// create FDB client with given config path
std::string fdb_cluster_file;
};
// Throws std::runtime_error on invalid options, or when neither kvEngine nor
// options.fdb_cluster_file is provided.
MetaScan(Options options,
std::shared_ptr<kv::IKVEngine> kvEngine = {} /* create new fdb client if kvEngine is not set */);
~MetaScan();
// Each call returns the next batch; an empty vector means the scan has finished and everything
// was consumed. Throws std::runtime_error if the scan ended with an error.
std::vector<Inode> getInodes();
std::vector<DirEntry> getDirEntries();
kv::IKVEngine &kvEngine() { return *kvEngine_; }
private:
// Cursor over the half-open key range [begin, end). snapshotGetRange() advances `begin` and
// updates `hasMore` after every successful read.
struct KeyRange {
std::string begin;
std::string end;
bool hasMore;
// Empty range with nothing to read.
KeyRange()
: begin(),
end(),
hasMore(false) {}
KeyRange(std::string begin, std::string end)
: begin(std::move(begin)),
end(std::move(end)),
hasMore(true) {}
// Splits the key space under `prefix` into 256 ranges, one per following byte value.
static std::vector<KeyRange> split(std::string prefix);
CoTryTask<kv::IReadOnlyTransaction::GetRangeResult> snapshotGetRange(kv::IReadOnlyTransaction &txn, int32_t limit);
// Hex dump of both bounds plus the hasMore flag, for logging.
std::string describe() const {
return fmt::format("[begin {:02x}, end {:02x}, hasMore {}]", fmt::join(begin, ""), fmt::join(end, ""), hasMore);
}
};
// State of one background scan: the bounded queue of result batches, the future of the scan
// coroutine (empty until the scan is started), and the source used to cancel it.
template <typename T>
struct BackgroundTask {
folly::coro::BoundedQueue<std::vector<T>> queue;
folly::SemiFuture<Result<Void>> future;
folly::CancellationSource cancel;
BackgroundTask(size_t cap)
: queue(cap),
future(folly::SemiFuture<Result<Void>>::makeEmpty()),
cancel() {}
};
void createKVEngine();
template <typename T>
std::vector<T> waitResult(std::optional<BackgroundTask<T>> &task);
CoTryTask<void> scanInode(BackgroundTask<Inode> &task);
CoTryTask<void> scanDirEntry(BackgroundTask<DirEntry> &task);
template <typename T>
CoTryTask<void> scan(kv::KeyPrefix prefix, BackgroundTask<T> &task);
template <typename T>
CoTryTask<void> scanRange(KeyRange range, BackgroundTask<T> &task);
// Serializes getInodes()/getDirEntries().
std::mutex mutex_;
Options options_;
// Set only when this object started the FDB network itself.
std::optional<std::jthread> fdbNetwork_;
std::shared_ptr<kv::IKVEngine> kvEngine_;
folly::CPUThreadPoolExecutor exec_;
std::optional<BackgroundTask<Inode>> scanInodeTask_;
std::optional<BackgroundTask<DirEntry>> scanDirEntryTask_;
};
} // namespace hf3fs::meta::server

8
src/meta/meta.cpp Normal file
View File

@@ -0,0 +1,8 @@
#include "common/app/TwoPhaseApplication.h"
#include "memory/common/OverrideCppNewDelete.h"
#include "meta/service/MetaServer.h"
// Entry point of the meta server binary: runs MetaServer through TwoPhaseApplication and returns
// its exit code.
int main(int argc, char *argv[]) {
using namespace hf3fs;
return TwoPhaseApplication<meta::server::MetaServer>().run(argc, argv);
}

View File

@@ -0,0 +1,439 @@
#include "meta/service/MetaOperator.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <fcntl.h>
#include <fmt/core.h>
#include <folly/Conv.h>
#include <folly/Expected.h>
#include <folly/Overload.h>
#include <folly/Random.h>
#include <folly/ScopeGuard.h>
#include <folly/experimental/coro/BlockingWait.h>
#include <folly/experimental/coro/Invoke.h>
#include <folly/experimental/coro/Sleep.h>
#include <folly/functional/Invoke.h>
#include <folly/functional/Partial.h>
#include <folly/logging/xlog.h>
#include <functional>
#include <memory>
#include <optional>
#include <type_traits>
#include <unistd.h>
#include <utility>
#include <vector>
#include "common/app/NodeId.h"
#include "common/kv/ITransaction.h"
#include "common/kv/WithTransaction.h"
#include "common/monitor/Recorder.h"
#include "common/serde/ClientContext.h"
#include "common/utils/BackgroundRunner.h"
#include "common/utils/CPUExecutorGroup.h"
#include "common/utils/Coroutine.h"
#include "common/utils/CoroutinesPool.h"
#include "common/utils/Duration.h"
#include "common/utils/FaultInjection.h"
#include "common/utils/Result.h"
#include "common/utils/RobinHood.h"
#include "common/utils/UtcTime.h"
#include "core/user/UserToken.h"
#include "fbs/meta/Service.h"
#include "fbs/meta/Utils.h"
#include "fdb/FDBRetryStrategy.h"
#include "meta/components/ChainAllocator.h"
#include "meta/components/Distributor.h"
#include "meta/components/FileHelper.h"
#include "meta/components/Forward.h"
#include "meta/components/InodeIdAllocator.h"
#include "meta/components/SessionManager.h"
#include "meta/store/Idempotent.h"
#include "meta/store/Inode.h"
#include "meta/store/MetaStore.h"
#include "meta/store/Operation.h"
#include "meta/store/PathResolve.h"
#include "meta/store/Utils.h"
#include "meta/store/ops/BatchOperation.h"
// When config_.authenticate() is enabled, awaits authenticate(user) and passes the result to
// CO_RETURN_ON_ERROR. Uses co_await and reads config_, so it is only usable inside MetaOperator
// coroutines.
#define AUTHENTICATE(user) \
do { \
if (config_.authenticate()) { \
CO_RETURN_ON_ERROR(co_await authenticate(user)); \
} \
} while (0)
namespace hf3fs::meta::server {
using namespace std::chrono_literals;
// Generic runner for a single MetaStore operation.
//   func - pointer to the MetaStore member that builds the operation object for `arg`.
//   arg  - the request. When it derives from ReqBase it is validated first, and a deadline is set
//          if config_.operation_timeout() is non-zero.
// The operation is executed by an OperationDriver on a fresh read-write transaction using the
// retry configuration from createRetryConfig().
template <typename Func, typename Arg>
auto MetaOperator::runOp(Func &&func, Arg &&arg)
-> CoTryTask<typename std::invoke_result_t<Func, MetaStore, Arg &&>::element_type::RspT> {
#ifndef NDEBUG
auto fi = FaultInjection::clone();
#endif
auto deadline = std::optional<SteadyTime>();
if constexpr (std::is_base_of_v<ReqBase, std::remove_reference_t<Arg>>) {
CO_RETURN_ON_ERROR(arg.valid());
if (config_.operation_timeout() != 0_s) {
deadline = SteadyClock::now() + config_.operation_timeout();
}
}
auto txn = kvEngine_->createReadWriteTransaction();
auto op = ((*metaStore_).*func)(std::forward<Arg>(arg));
// NOTE(review): `arg` is used again after std::forward above. All callers in this file pass
// lvalues, for which the forward is a no-op -- confirm before calling with an rvalue.
auto driver = OperationDriver(*op, arg, deadline);
co_return co_await driver.run(std::move(txn), createRetryConfig(), config_.readonly(), config_.grv_cache());
}
// Executes one batched operation for `inodeId` on a fresh read-write transaction, then hands the
// batch slot on: under the batches_ lock for op->inodeId_, wakes the next queued batch or, if
// there is none, erases the map entry. Returns the driver's result.
CoTryTask<Inode> MetaOperator::runBatch(InodeId inodeId,
std::unique_ptr<BatchedOp> op,
std::optional<SteadyTime> deadline) {
#ifndef NDEBUG
auto fi = FaultInjection::clone();
#endif
assert(op);
auto txn = kvEngine_->createReadWriteTransaction();
auto driver = OperationDriver(*op, Void{}, deadline);
auto result = co_await driver.run(std::move(txn), createRetryConfig(), config_.readonly(), config_.grv_cache());
if (!result.hasError()) {
// Sanity check: a successful batch must return the inode it was scheduled for.
XLOGF_IF(FATAL, inodeId != result->id, "expected {}, get {}", inodeId, result->id);
}
batches_.withLock(
[&](auto &map) {
auto iter = map.find(op->inodeId_);
XLOGF_IF(FATAL, iter == map.end(), "shouldn't happen");
// wakeupNext() returning false means nothing else is queued for this inode.
if (!iter->second.wakeupNext()) {
map.erase(iter);
}
},
op->inodeId_);
co_return result;
}
// Submits `req` to the batch for `inodeId` and waits for its individual result.
// The request is validated, wrapped in a Waiter and registered with addBatchReq(); the coroutine
// then waits on the waiter's baton. If addBatchReq() returned an operation, this caller also
// drives that batch via runBatch(). The batch-level result is discarded; the per-request result
// comes from the waiter. The call is recorded through OperationRecorder.
template <typename Req, typename Rsp>
CoTryTask<Rsp> MetaOperator::runInBatch(InodeId inodeId, Req req) {
CO_RETURN_ON_ERROR(req.valid());
auto deadline = std::optional<SteadyTime>();
if (config_.operation_timeout() != 0_s) {
deadline = SteadyClock::now() + config_.operation_timeout();
}
OperationRecorder::Guard guard(OperationRecorder::server(), MetaSerde<>::getRpcName(req), req.user.uid);
BatchedOp::Waiter<Req, Rsp> waiter(std::move(req));
auto op = addBatchReq(inodeId, waiter);
co_await waiter.baton;
if (op) {
co_await runBatch(inodeId, std::move(op), deadline);
}
auto result = waiter.getResult();
guard.finish(result);
co_return result;
}
// Wires up all components of the meta operator (distributor, user store, allocators, file
// helper, session manager, GC manager, forwarder, meta store). Nothing is started here; see
// init() and start(). Finally installs this->close() as the session manager's close callback.
MetaOperator::MetaOperator(const Config &cfg,
flat::NodeId nodeId,
std::shared_ptr<kv::IKVEngine> kvEngine,
std::shared_ptr<client::ICommonMgmtdClient> mgmtdClient,
std::shared_ptr<storage::client::StorageClient> storageClient,
std::unique_ptr<Forward> forward)
: config_(cfg),
nodeId_(nodeId),
metaEventTraceLog_(config_.event_trace_log()),
kvEngine_(kvEngine),
mgmtd_(mgmtdClient),
distributor_(std::make_shared<Distributor>(cfg.distributor(), nodeId, kvEngine)),
userStore_(std::make_shared<core::UserStoreEx>(*kvEngine_, config_.retry_transaction(), config_.user_cache())),
inodeIdAlloc_(InodeIdAllocator::create(kvEngine)),
chainAlloc_(std::make_shared<ChainAllocator>(mgmtdClient)),
fileHelper_(std::make_shared<FileHelper>(cfg, mgmtdClient, storageClient)),
sessionManager_(
std::make_shared<SessionManager>(cfg.session_manager(), nodeId, kvEngine_, mgmtdClient, fileHelper_)),
gcManager_(std::make_shared<GcManager>(cfg,
nodeId,
metaEventTraceLog_,
kvEngine_,
mgmtdClient,
inodeIdAlloc_,
fileHelper_,
sessionManager_,
userStore_)),
forward_(std::move(forward)),
metaStore_(std::make_unique<MetaStore>(cfg,
metaEventTraceLog_,
distributor_,
inodeIdAlloc_,
chainAlloc_,
fileHelper_,
sessionManager_,
gcManager_)) {
// The callback only uses members of *this (close()); it adapts the CloseRsp result to void.
sessionManager_->setCloseFunc(
[&](const auto &req) -> CoTryTask<void> { co_return (co_await close(req)).then([](auto &) { return Void{}; }); });
}
// One-time initialization: optionally runs MetaStore::initFs with the given root layout, opens
// the event trace log (kIOError if that fails), then initializes the GC manager.
CoTryTask<void> MetaOperator::init(std::optional<Layout> layout) {
  XLOGF(INFO, "MetaOperator::init");
  if (layout) {
    auto initFsResult = co_await runOp(&MetaStore::initFs, layout.value());
    CO_RETURN_ON_ERROR(initFsResult);
  }

  if (!metaEventTraceLog_.open()) {
    XLOGF(CRITICAL, "Failed to open trace log in directory: {}", config_.event_trace_log().trace_file_dir());
    co_return makeError(StatusCode::kIOError);
  }

  auto gcInitResult = co_await gcManager_->init();
  CO_RETURN_ON_ERROR(gcInitResult);

  XLOGF(INFO, "MetaOperator::init success.");
  co_return Void{};
}
// Starts the background components (distributor, file helper, GC manager, session manager) and
// registers the periodic "idempotent_clean" job on a randomly picked executor.
void MetaOperator::start(CPUExecutorGroup &exec) {
XLOGF(INFO, "MetaOperator::start");
distributor_->start(exec);
fileHelper_->start(exec);
gcManager_->start(exec);
sessionManager_->start(exec);
bgRunner_ = std::make_unique<BackgroundRunner>(&exec.randomPick());
bgRunner_->start(
"idempotent_clean",
[&]() -> CoTask<void> {
// Only the node for which isFirstMeta() holds performs the cleanup.
if (!isFirstMeta(*mgmtd_, nodeId_)) co_return;
// `prev` is the resume position handed back by the previous clean call.
auto prev = std::optional<std::string>();
size_t total = 0, cleaned = 0;
auto more = true;
// Clean in batches of 2048, one transaction per batch, until done or stop is requested.
while (more && !stop_) {
size_t t = 0, c = 0;
auto strategy = kv::FDBRetryStrategy(createRetryConfig());
auto txn = kvEngine_->createReadWriteTransaction();
auto result = co_await kv::WithTransaction(strategy).run(
std::move(txn),
[&](kv::IReadWriteTransaction &txn) -> CoTryTask<std::pair<std::string, bool>> {
co_return co_await Idempotent::clean(txn, prev, config_.idempotent_record_expire(), 2048, t, c);
});
if (!result) {
XLOGF(ERR, "Clean idempotent record failed, {}", result.error());
break;
}
total += t;
cleaned += c;
prev = result->first;
more = result->second;
}
XLOGF(INFO, "Clean idempotent record, total {}, cleaned {}", total, cleaned);
co_return;
},
config_.idempotent_record_clean_getter());
}
// First shutdown phase: raises stop_ (observed by the idempotent_clean loop) and stops the
// distributor.
void MetaOperator::beforeStop() {
XLOGF(INFO, "MetaOperator::beforeStop");
stop_ = true;
if (distributor_) {
distributor_->stopAndJoin(true);
}
XLOGF(INFO, "MetaOperator::beforeStop finished");
}
// Second shutdown phase. Order: background runner (blocking until all jobs stop), GC manager,
// session manager, file helper, and finally the event trace log is closed.
void MetaOperator::afterStop() {
XLOGF(INFO, "MetaOperator::afterStop");
if (bgRunner_) {
folly::coro::blockingWait(bgRunner_->stopAll());
bgRunner_.reset();
}
if (gcManager_) {
gcManager_->stopAndJoin();
}
if (sessionManager_) {
sessionManager_->stopAndJoin();
}
if (fileHelper_) {
fileHelper_->stopAndJoin();
}
metaEventTraceLog_.close();
XLOGF(INFO, "MetaOperator::afterStop finished");
}
// Builds the FDB retry configuration from the current retry_transaction settings; the third
// field is always `true`.
kv::FDBRetryStrategy::Config MetaOperator::createRetryConfig() const {
  const auto &retryCfg = config_.retry_transaction();
  return kv::FDBRetryStrategy::Config{retryCfg.max_backoff(), retryCfg.max_retry_count(), true};
}
// Authenticates `userInfo` against the user store. Every exit that does not reach
// guard.dismiss() -- an error result or an exception -- adds one sample to the
// "meta_server.auth_failed" counter, tagged with the uid.
CoTryTask<void> MetaOperator::authenticate(UserInfo &userInfo) {
static monitor::CountRecorder failed("meta_server.auth_failed");
auto guard = folly::makeGuard([&]() {
failed.addSample(1, {{"uid", folly::to<std::string>(userInfo.uid.toUnderType())}});
});
auto ret = co_await userStore_->authenticate(userInfo);
CO_RETURN_ON_ERROR(ret);
// Success: do not count a failure.
guard.dismiss();
co_return Void{};
}
// RPC entry: authenticates req.user (when enabled) and returns it wrapped in an AuthRsp.
CoTryTask<AuthRsp> MetaOperator::authenticate(AuthReq req) {
AUTHENTICATE(req.user);
co_return AuthRsp(std::move(req.user));
}
// RPC entry: authenticates the caller (when enabled), then runs MetaStore::statFs via runOp().
CoTryTask<StatFsRsp> MetaOperator::statFs(StatFsReq req) {
AUTHENTICATE(req.user);
co_return co_await runOp(&MetaStore::statFs, req);
}
// RPC entry: authenticates the caller (when enabled), then runs MetaStore::stat via runOp().
CoTryTask<StatRsp> MetaOperator::stat(StatReq req) {
AUTHENTICATE(req.user);
co_return co_await runOp(&MetaStore::stat, req);
}
// RPC entry: authenticates the caller (when enabled), then runs MetaStore::batchStat via runOp().
CoTryTask<BatchStatRsp> MetaOperator::batchStat(BatchStatReq req) {
AUTHENTICATE(req.user);
co_return co_await runOp(&MetaStore::batchStat, req);
}
// RPC entry: authenticates the caller (when enabled), then runs MetaStore::batchStatByPath via
// runOp().
CoTryTask<BatchStatByPathRsp> MetaOperator::batchStatByPath(BatchStatByPathReq req) {
AUTHENTICATE(req.user);
co_return co_await runOp(&MetaStore::batchStatByPath, req);
}
// RPC entry: authenticates the caller (when enabled), then runs MetaStore::getRealPath via
// runOp().
CoTryTask<GetRealPathRsp> MetaOperator::getRealPath(GetRealPathReq req) {
AUTHENTICATE(req.user);
co_return co_await runOp(&MetaStore::getRealPath, req);
}
// RPC entry: authenticates the caller (when enabled), then runs MetaStore::open via runOp().
CoTryTask<OpenRsp> MetaOperator::open(OpenReq req) {
AUTHENTICATE(req.user);
co_return co_await runOp(&MetaStore::open, req);
}
// RPC entry kept only so that old clients get a clear answer: truncate is no longer served.
// Logs the calling client's hostname and always returns kNotImplemented.
CoTryTask<TruncateRsp> MetaOperator::truncate(TruncateReq req) {
  XLOGF(CRITICAL, "truncate is deprecated, update client {}", req.client.hostname);
  co_return makeError(StatusCode::kNotImplemented, "truncate is deprecated, update client");
}
// RPC entry for sync. The request is served by the node the distributor assigns to the inode:
// forwarded when that is another node, otherwise queued into this node's batch for the inode.
CoTryTask<SyncRsp> MetaOperator::sync(SyncReq req) {
  // NOTE: don't auth user for sync
  auto owner = distributor_->getServer(req.inode);
  const bool ownedLocally = (owner == distributor_->nodeId());
  if (!ownedLocally) {
    co_return co_await forward_->forward<SyncReq, SyncRsp>(owner, std::move(req));
  }
  // Copy the id first: `req` is moved in the same call expression below.
  auto inodeId = req.inode;
  co_return co_await runInBatch<SyncReq, SyncRsp>(inodeId, std::move(req));
}
// RPC entry for close. The request is served by the node the distributor assigns to the inode:
// forwarded when that is another node, otherwise queued into this node's batch for the inode.
CoTryTask<CloseRsp> MetaOperator::close(CloseReq req) {
  // Note: don't auth user here
  auto owner = distributor_->getServer(req.inode);
  const bool ownedLocally = (owner == distributor_->nodeId());
  if (!ownedLocally) {
    co_return co_await forward_->forward<CloseReq, CloseRsp>(owner, std::move(req));
  }
  // Copy the id first: `req` is moved in the same call expression below.
  auto inodeId = req.inode;
  co_return co_await runInBatch<CloseReq, CloseRsp>(inodeId, std::move(req));
}
// RPC entry for create. When the path has a parent component, MetaStore::tryOpen is attempted
// first; otherwise, or when tryOpen leaves a single-component request, the create is routed to
// the node owning the parent directory (forwarded, or batched locally on req.path.parent).
CoTryTask<CreateRsp> MetaOperator::create(CreateReq req) {
AUTHENTICATE(req.user);
CO_RETURN_ON_ERROR(req.valid());
XLOGF(DBG, "create {}", req);
if (req.path.path->has_parent_path()) {
// try open first.
// `req` is passed by reference and has_parent_path() is re-checked afterwards.
// NOTE(review): presumably tryOpen rewrites req.path to (parent inode, entry name) when the
// file does not exist yet -- confirm in MetaStore::tryOpen.
auto result = co_await runOp(&MetaStore::tryOpen, req);
// Done if the open succeeded, or if the path still has a parent component.
if (result.hasValue() || req.path.path->has_parent_path()) {
co_return result;
}
if (!req.valid()) {
auto msg = fmt::format("req {} not valid after try open", req);
XLOG(DFATAL, msg);
co_return makeError(MetaCode::kFoundBug, std::move(msg));
}
XLOGF(DBG, "create {}", req);
}
auto node = distributor_->getServer(req.path.parent);
if (node == distributor_->nodeId()) {
// Copy the id first: `req` is moved in the same call expression below.
auto parentId = req.path.parent;
co_return co_await runInBatch<CreateReq, CreateRsp>(parentId, std::move(req));
} else {
co_return co_await forward_->forward<CreateReq, CreateRsp>(node, std::move(req));
}
}
// RPC entry: authenticates the caller (when enabled), then runs MetaStore::mkdirs via runOp().
CoTryTask<MkdirsRsp> MetaOperator::mkdirs(MkdirsReq req) {
AUTHENTICATE(req.user);
co_return co_await runOp(&MetaStore::mkdirs, req);
}
// RPC entry: authenticates the caller (when enabled), then runs MetaStore::symlink via runOp().
CoTryTask<SymlinkRsp> MetaOperator::symlink(SymlinkReq req) {
AUTHENTICATE(req.user);
co_return co_await runOp(&MetaStore::symlink, req);
}
// RPC entry: authenticates the caller (when enabled), then runs MetaStore::remove via runOp().
CoTryTask<RemoveRsp> MetaOperator::remove(RemoveReq req) {
AUTHENTICATE(req.user);
co_return co_await runOp(&MetaStore::remove, req);
}
// RPC entry: authenticates the caller (when enabled), then runs MetaStore::rename via runOp().
CoTryTask<RenameRsp> MetaOperator::rename(RenameReq req) {
AUTHENTICATE(req.user);
co_return co_await runOp(&MetaStore::rename, req);
}
// RPC entry: authenticates the caller (when enabled), then runs MetaStore::list via runOp().
CoTryTask<ListRsp> MetaOperator::list(ListReq req) {
AUTHENTICATE(req.user);
co_return co_await runOp(&MetaStore::list, req);
}
// RPC entry: authenticates the caller (when enabled), then runs MetaStore::hardLink via runOp().
CoTryTask<HardLinkRsp> MetaOperator::hardLink(HardLinkReq req) {
AUTHENTICATE(req.user);
co_return co_await runOp(&MetaStore::hardLink, req);
}
// RPC entry for setAttr. Requests that carry a path run as a standalone MetaStore::setAttr
// operation. Requests without one are routed by req.path.parent: forwarded when another node owns
// it, otherwise queued into this node's batch for that inode.
CoTryTask<SetAttrRsp> MetaOperator::setAttr(SetAttrReq req) {
  AUTHENTICATE(req.user);
  if (req.path.path) {
    co_return co_await runOp(&MetaStore::setAttr, req);
  }

  auto owner = distributor_->getServer(req.path.parent);
  const bool ownedLocally = (owner == distributor_->nodeId());
  if (!ownedLocally) {
    co_return co_await forward_->forward<SetAttrReq, SetAttrRsp>(owner, std::move(req));
  }
  // Copy the id first: `req` is moved in the same call expression below.
  auto parentId = req.path.parent;
  co_return co_await runInBatch<SetAttrReq, SetAttrRsp>(parentId, std::move(req));
}
// RPC entry: authenticates the caller (when enabled), then runs MetaStore::lockDirectory via
// runOp().
CoTryTask<LockDirectoryRsp> MetaOperator::lockDirectory(LockDirectoryReq req) {
AUTHENTICATE(req.user);
co_return co_await runOp(&MetaStore::lockDirectory, req);
}
// RPC entry: runs MetaStore::pruneSession via runOp(). No AUTHENTICATE step is performed here.
CoTryTask<PruneSessionRsp> MetaOperator::pruneSession(PruneSessionReq req) {
co_return co_await runOp(&MetaStore::pruneSession, req);
}
// RPC entry: clears the user cache -- entirely when req.dropAll is set, otherwise only the entry
// for req.uid when one is given. With neither set it does nothing. Always returns an empty
// response; no AUTHENTICATE step is performed here.
CoTryTask<DropUserCacheRsp> MetaOperator::dropUserCache(DropUserCacheReq req) {
if (req.dropAll) {
userStore_->cache().clear();
} else if (req.uid) {
userStore_->cache().clear(*req.uid);
}
co_return DropUserCacheRsp{};
}
// RPC entry used for testing: runs MetaStore::testRpc via runOp() without authentication.
CoTryTask<TestRpcRsp> MetaOperator::testRpc(TestRpcReq req) {
// don't need auth user
co_return co_await runOp(&MetaStore::testRpc, req);
}
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,212 @@
#pragma once
#include <arrow/util/macros.h>
#include <atomic>
#include <folly/Likely.h>
#include <folly/Utility.h>
#include <folly/executors/CPUThreadPoolExecutor.h>
#include <folly/experimental/coro/Baton.h>
#include <folly/functional/Invoke.h>
#include <folly/logging/xlog.h>
#include <functional>
#include <memory>
#include <optional>
#include <type_traits>
#include <utility>
#include "client/mgmtd/ICommonMgmtdClient.h"
#include "client/mgmtd/IMgmtdClientForServer.h"
#include "client/storage/StorageClient.h"
#include "common/kv/IKVEngine.h"
#include "common/kv/ITransaction.h"
#include "common/kv/WithTransaction.h"
#include "common/utils/BackgroundRunner.h"
#include "common/utils/CPUExecutorGroup.h"
#include "common/utils/Coroutine.h"
#include "common/utils/CoroutinesPool.h"
#include "common/utils/Result.h"
#include "core/user/UserStoreEx.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Service.h"
#include "fdb/FDBRetryStrategy.h"
#include "meta/base/Config.h"
#include "meta/components/ChainAllocator.h"
#include "meta/components/Distributor.h"
#include "meta/components/FileHelper.h"
#include "meta/components/Forward.h"
#include "meta/components/GcManager.h"
#include "meta/components/SessionManager.h"
#include "meta/store/Inode.h"
#include "meta/store/MetaStore.h"
#include "meta/store/ops/BatchOperation.h"
namespace hf3fs::meta::server {
class BatchedOp;
// Front door for metadata operations. Each public RPC-style method takes a request struct by value
// and returns a coroutine yielding the matching response (or an error). The object is neither
// copyable nor movable.
class MetaOperator : public folly::NonCopyableNonMovable {
public:
// Wires the operator to its collaborators. `cfg` is kept by reference (see config_ below), so it
// must outlive the operator; `forward` is taken over by value.
MetaOperator(const Config &cfg,
flat::NodeId nodeId,
std::shared_ptr<kv::IKVEngine> kvEngine,
std::shared_ptr<client::ICommonMgmtdClient> mgmtdClient,
std::shared_ptr<storage::client::StorageClient> storageClient,
std::unique_ptr<Forward> forward);
// Lifecycle hooks. Definitions are not in this header; callers visible elsewhere invoke them in the
// order init -> start -> beforeStop -> afterStop.
CoTryTask<void> init(std::optional<Layout> rootLayout);
void start(CPUExecutorGroup &exec);
void beforeStop();
void afterStop();
// Request handlers, one per metadata RPC.
CoTryTask<AuthRsp> authenticate(AuthReq req);
CoTryTask<StatFsRsp> statFs(StatFsReq req);
CoTryTask<StatRsp> stat(StatReq req);
CoTryTask<GetRealPathRsp> getRealPath(GetRealPathReq req);
CoTryTask<OpenRsp> open(OpenReq req);
CoTryTask<CloseRsp> close(CloseReq req);
CoTryTask<CreateRsp> create(CreateReq req);
CoTryTask<MkdirsRsp> mkdirs(MkdirsReq req);
CoTryTask<SymlinkRsp> symlink(SymlinkReq req);
CoTryTask<RemoveRsp> remove(RemoveReq req);
CoTryTask<RenameRsp> rename(RenameReq req);
CoTryTask<ListRsp> list(ListReq req);
CoTryTask<TruncateRsp> truncate(TruncateReq req);
CoTryTask<SyncRsp> sync(SyncReq req);
CoTryTask<HardLinkRsp> hardLink(HardLinkReq req);
CoTryTask<SetAttrRsp> setAttr(SetAttrReq req);
CoTryTask<PruneSessionRsp> pruneSession(PruneSessionReq req);
CoTryTask<DropUserCacheRsp> dropUserCache(DropUserCacheReq req);
CoTryTask<LockDirectoryRsp> lockDirectory(LockDirectoryReq req);
CoTryTask<TestRpcRsp> testRpc(TestRpcReq req);
CoTryTask<BatchStatRsp> batchStat(BatchStatReq req);
CoTryTask<BatchStatByPathRsp> batchStatByPath(BatchStatByPathReq req);
private:
// MockMeta and the listed test fixtures read private members directly.
friend class MockMeta;
template <typename>
FRIEND_TEST(TestBatchOp, batch);
template <typename>
FRIEND_TEST(TestCreate, batch);
// Per-inode batching slot: remembers the queued BatchedOp (if any) and the baton that will be
// posted to wake it. Both pointers are non-owning.
class Batch {
public:
// Registers `op` as the queued batch and `baton` as its wake-up handle.
void setNext(BatchedOp *op, folly::coro::Baton *baton) {
next = op;
nextBaton = baton;
}
// Posts the queued batch's baton and clears the slot. Returns false when nothing was queued.
bool wakeupNext() {
if (!next) {
return false;
}
nextBaton->post();
next = nullptr;
nextBaton = nullptr;
return true;
}
BatchedOp *getNext() const { return next; }
private:
BatchedOp *next = nullptr;
folly::coro::Baton *nextBaton = nullptr;
};
kv::FDBRetryStrategy::Config createRetryConfig() const;
kv::FDBRetryStrategy createRetryStrategy() const { return kv::FDBRetryStrategy(createRetryConfig()); }
// Runs a single MetaStore operation; the response type is derived from the operation object that
// `func` returns. Defined outside this header.
template <typename Func, typename Arg>
auto runOp(Func &&func, Arg &&arg)
-> CoTryTask<typename std::invoke_result_t<Func, MetaStore, Arg &&>::element_type::RspT>;
// Enqueues `waiter` into the batch for `inodeId`, under the shard lock of batches_.
//
// Returns a newly created BatchedOp when this waiter opened a batch (the caller then owns it),
// or an empty pointer when the waiter joined an already queued batch or was rejected.
//
// Three cases:
//   1. no slot existed for the inode: a new op is created and the waiter's baton is posted at once;
//   2. a slot exists but nothing is queued: a new op is created and parked in the slot; its baton
//      is posted later through Batch::wakeupNext();
//   3. an op is already queued: the waiter is appended to it, unless max_batch_operations (non-zero)
//      is reached, in which case the waiter is completed immediately with MetaCode::kBusy.
template <typename Req, typename Rsp>
std::unique_ptr<BatchedOp> addBatchReq(InodeId inodeId, BatchedOp::Waiter<Req, Rsp> &waiter) {
auto func = [&](auto &map) {
auto [iter, inserted] = map.try_emplace(inodeId);
auto &batch = iter->second;
if (inserted) {
assert(!batch.getNext());
auto op = std::make_unique<BatchedOp>(*metaStore_, inodeId);
op->add(waiter);
waiter.baton.post();
return op;
} else if (!batch.getNext()) {
auto op = std::make_unique<BatchedOp>(*metaStore_, inodeId);
op->add(waiter);
batch.setNext(op.get(), &waiter.baton);
return op;
} else {
auto next = batch.getNext();
auto num_reqs = next->numReqs();
if (UNLIKELY(config_.max_batch_operations() != 0 && num_reqs >= config_.max_batch_operations())) {
auto msg = fmt::format("too many batch operations on {}", inodeId);
XLOG(WARN, msg);
waiter.result = makeError(MetaCode::kBusy, std::move(msg));
waiter.baton.post();
} else {
// Log a warning each time the queued batch crosses another multiple of 1024 requests.
if (num_reqs && num_reqs % 1024 == 0) {
XLOGF(WARN, "{} batch operations on {}", num_reqs, inodeId);
}
next->add(waiter);
}
return std::unique_ptr<BatchedOp>();
}
};
return batches_.withLock(func, inodeId);
}
CoTryTask<Inode> runBatch(InodeId inodeId,
std::unique_ptr<BatchedOp> op,
std::optional<SteadyTime> deadline = std::nullopt);
template <typename Req, typename Rsp>
CoTryTask<Rsp> runInBatch(InodeId inodeId, Req req);
CoTryTask<void> authenticate(UserInfo &userInfo);
// Held by reference: the Config object passed to the constructor must outlive this operator.
const Config &config_;
flat::NodeId nodeId_;
analytics::StructuredTraceLog<MetaEventTrace> metaEventTraceLog_;
std::shared_ptr<kv::IKVEngine> kvEngine_;
std::shared_ptr<client::ICommonMgmtdClient> mgmtd_;
std::shared_ptr<Distributor> distributor_;
std::shared_ptr<core::UserStoreEx> userStore_;
std::shared_ptr<InodeIdAllocator> inodeIdAlloc_;
std::shared_ptr<ChainAllocator> chainAlloc_;
std::shared_ptr<FileHelper> fileHelper_;
std::shared_ptr<SessionManager> sessionManager_;
std::shared_ptr<GcManager> gcManager_;
std::unique_ptr<Forward> forward_;
std::unique_ptr<MetaStore> metaStore_;
// Batching slots keyed by inode id, split over 63 shards; accessed through withLock().
Shards<std::map<InodeId, Batch>, 63> batches_;
std::atomic_bool stop_{false};
std::unique_ptr<BackgroundRunner> bgRunner_;
};
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,45 @@
#pragma once
#include "common/serde/CallContext.h"
#include "fbs/meta/Service.h"
#include "meta/service/MetaOperator.h"
namespace hf3fs::meta::server {
// Serde service adapter for the MetaSerde interface: every RPC is a thin member function that
// forwards the request to the MetaOperator given at construction.
class MetaSerdeService : public serde::ServiceWrapper<MetaSerdeService, MetaSerde> {
public:
// Keeps a reference to `meta` (not a copy); the operator must outlive this service.
MetaSerdeService(MetaOperator &meta)
: meta_(meta) {}
// Generates handler NAME(ctx, req): the call context is ignored and `req` is passed on to
// meta_.NAME, whose coroutine is returned as-is.
#define META_SERVICE_METHOD(NAME, REQ, RESP) \
CoTryTask<RESP> NAME(serde::CallContext &, const REQ &req) { return meta_.NAME(req); }
META_SERVICE_METHOD(statFs, StatFsReq, StatFsRsp);
META_SERVICE_METHOD(stat, StatReq, StatRsp);
META_SERVICE_METHOD(create, CreateReq, CreateRsp);
META_SERVICE_METHOD(mkdirs, MkdirsReq, MkdirsRsp);
META_SERVICE_METHOD(symlink, SymlinkReq, SymlinkRsp);
META_SERVICE_METHOD(hardLink, HardLinkReq, HardLinkRsp);
META_SERVICE_METHOD(remove, RemoveReq, RemoveRsp);
META_SERVICE_METHOD(open, OpenReq, OpenRsp);
META_SERVICE_METHOD(sync, SyncReq, SyncRsp);
META_SERVICE_METHOD(close, CloseReq, CloseRsp);
META_SERVICE_METHOD(rename, RenameReq, RenameRsp);
META_SERVICE_METHOD(list, ListReq, ListRsp);
META_SERVICE_METHOD(truncate, TruncateReq, TruncateRsp);
META_SERVICE_METHOD(getRealPath, GetRealPathReq, GetRealPathRsp);
META_SERVICE_METHOD(setAttr, SetAttrReq, SetAttrRsp);
META_SERVICE_METHOD(pruneSession, PruneSessionReq, PruneSessionRsp);
META_SERVICE_METHOD(dropUserCache, DropUserCacheReq, DropUserCacheRsp);
META_SERVICE_METHOD(authenticate, AuthReq, AuthRsp);
META_SERVICE_METHOD(lockDirectory, LockDirectoryReq, LockDirectoryRsp);
META_SERVICE_METHOD(testRpc, TestRpcReq, TestRpcRsp);
META_SERVICE_METHOD(batchStat, BatchStatReq, BatchStatRsp);
META_SERVICE_METHOD(batchStatByPath, BatchStatByPathReq, BatchStatByPathRsp);
#undef META_SERVICE_METHOD
private:
MetaOperator &meta_;
};
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,111 @@
#include "meta/service/MetaServer.h"
#include <folly/experimental/coro/BlockingWait.h>
#include <folly/logging/xlog.h>
#include <memory>
#include <optional>
#include "common/app/ApplicationBase.h"
#include "common/utils/Result.h"
#include "core/service/CoreService.h"
#include "fdb/HybridKvEngine.h"
#include "meta/components/ChainAllocator.h"
#include "meta/service/MetaOperator.h"
#include "meta/service/MetaSerdeService.h"
#include "stubs/common/RealStubFactory.h"
#include "stubs/mgmtd/MgmtdServiceStub.h"
namespace hf3fs::meta::server {
// Builds the underlying net::Server from config.base() and keeps a reference to `config`
// (config_ is a reference member, so the Config object must outlive the server).
MetaServer::MetaServer(const MetaServer::Config &config)
: net::Server(config.base()),
config_(config) {}
// Only logs; members are released by their own destructors.
MetaServer::~MetaServer() { XLOGF(INFO, "Destructor MetaServer"); }
// Start-up hook: builds every dependency of the meta service and registers the RPC services.
//
// Steps, in order:
//   1. create + start the background net::Client unless one was injected through start();
//   2. create the mgmtd client unless injected, then start it and fetch routing info (fatal on failure);
//   3. create the KV engine unless injected;
//   4. create the storage client (fatal on failure) and validate the node id (fatal if unset);
//   5. construct MetaOperator, register the MetaSerde and Core services;
//   6. run MetaOperator::init (with a default root layout only when use_memkv is set) and start it.
//
// Returns an error if the background client fails to start, a service cannot be registered, or
// MetaOperator::init fails. Note that an early return can leave metaOperator_ unset.
Result<Void> MetaServer::beforeStart() {
if (!backgroundClient_) {
backgroundClient_ = std::make_unique<net::Client>(config_.background_client());
RETURN_ON_ERROR(backgroundClient_->start());
}
if (!mgmtdClient_) {
// The stub factory captures `this` and creates serde contexts on the background client.
auto ctxCreator = [this](net::Address addr) { return backgroundClient_->serdeCtx(addr); };
mgmtdClient_ = std::make_shared<::hf3fs::client::MgmtdClientForServer>(
appInfo().clusterId,
std::make_unique<stubs::RealStubFactory<mgmtd::MgmtdServiceStub>>(std::move(ctxCreator)),
config_.mgmtd_client());
}
mgmtdClient_->setAppInfoForHeartbeat(appInfo());
mgmtdClient_->setConfigListener(ApplicationBase::updateConfig);
mgmtdClient_->updateHeartbeatPayload(flat::MetaHeartbeatInfo{});
// The value returned by start() is not inspected here; only the routing refresh below is checked.
folly::coro::blockingWait(mgmtdClient_->start(&tpg().bgThreadPool().randomPick()));
auto mgmtdClientRefreshRes = folly::coro::blockingWait(mgmtdClient_->refreshRoutingInfo(/*force=*/false));
XLOGF_IF(FATAL, !mgmtdClientRefreshRes, "Failed to refresh initial routing info!");
// init service groups.
if (!kvEngine_) {
kvEngine_ = kv::HybridKvEngine::from(config_.kv_engine(), config_.use_memkv(), config_.fdb());
}
auto storageClient = storage::client::StorageClient::create(ClientId::random(appInfo().hostname),
config_.storage_client(),
*mgmtdClient_);
XLOGF_IF(FATAL, !storageClient, "Failed to create storage client!");
// From here on the local `appInfo` reference shadows the appInfo() accessor.
auto &appInfo = this->appInfo();
XLOGF_IF(FATAL, !appInfo.nodeId, "Invalid nodeId {}", appInfo.nodeId);
metaOperator_ = std::make_unique<MetaOperator>(
config_.meta(),
appInfo.nodeId,
kvEngine_,
mgmtdClient_,
storageClient,
std::make_unique<Forward>(config_.meta().forward(), appInfo.nodeId, *backgroundClient_, mgmtdClient_));
RETURN_ON_ERROR(addSerdeService(std::make_unique<MetaSerdeService>(*metaOperator_), true));
RETURN_ON_ERROR(addSerdeService(std::make_unique<core::CoreService>()));
// init MetaOperator.
std::optional<Layout> rootLayout;
if (config_.use_memkv()) {
rootLayout = Layout::newEmpty(ChainTableId(1), 512 << 10, 1);
}
auto result = folly::coro::blockingWait(metaOperator_->init(rootLayout));
if (UNLIKELY(!result)) {
XLOGF(ERR, "Init MetaOperator failed with {}", result.error().describe());
RETURN_ON_ERROR(result);
}
metaOperator_->start(tpg().bgThreadPool());
return Void{};
}
// Shutdown hook, first phase: lets the MetaOperator prepare for stopping, then stops the mgmtd client.
//
// Both members are checked before use. metaOperator_ is only assigned part-way through
// beforeStart(), which can return early (for instance when the background client fails to start),
// so it may still be null if the server is stopped after a failed start.
//
// Always returns Void{}.
Result<Void> MetaServer::beforeStop() {
  if (metaOperator_) {
    metaOperator_->beforeStop();
  }
  if (mgmtdClient_) {
    folly::coro::blockingWait(mgmtdClient_->stop());
  }
  return Void{};
}
// Shutdown hook, second phase: finishes stopping the MetaOperator, then stops and joins the
// background client.
//
// Both members are checked before use. metaOperator_ is only assigned part-way through
// beforeStart(), which can return early, so it may still be null if the server is stopped after a
// failed start.
//
// Always returns Void{}.
Result<Void> MetaServer::afterStop() {
  if (metaOperator_) {
    metaOperator_->afterStop();
  }
  if (backgroundClient_) {
    backgroundClient_->stopAndJoin();
  }
  return Void{};
}
// Starts the server with an externally supplied KV engine.
// The engine is stored first so that beforeStart() finds kvEngine_ set and does not create its own.
Result<Void> MetaServer::start(const flat::AppInfo &info, std::shared_ptr<kv::IKVEngine> kvEngine) {
  kvEngine_ = std::move(kvEngine);
  auto started = net::Server::start(info);
  return started;
}
// Starts the server with an externally supplied network client and mgmtd client.
// `mgmtdClient` is wrapped in a MgmtdClientForServer; both members are set before the base start()
// runs so that beforeStart() reuses them instead of creating new ones.
Result<Void> MetaServer::start(const flat::AppInfo &info,
                               std::unique_ptr<net::Client> client,
                               std::shared_ptr<::hf3fs::client::MgmtdClient> mgmtdClient) {
  backgroundClient_ = std::move(client);
  auto wrapped = std::make_shared<::hf3fs::client::MgmtdClientForServer>(std::move(mgmtdClient));
  mgmtdClient_ = std::move(wrapped);
  auto started = net::Server::start(info);
  return started;
}
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,96 @@
#pragma once
#include <memory>
#include "client/mgmtd/MgmtdClientForServer.h"
#include "client/storage/StorageClient.h"
#include "common/logging/LogConfig.h"
#include "common/net/Client.h"
#include "common/net/Server.h"
#include "common/utils/BackgroundRunner.h"
#include "common/utils/ConfigBase.h"
#include "core/app/ServerAppConfig.h"
#include "core/app/ServerLauncher.h"
#include "core/app/ServerLauncherConfig.h"
#include "core/app/ServerMgmtdClientFetcher.h"
#include "fdb/HybridKvEngineConfig.h"
#include "meta/base/Config.h"
#include "meta/service/MetaOperator.h"
namespace hf3fs::meta::server {
// Network server hosting the metadata service. Besides the server itself, the class bundles the
// type aliases and config structs that the application launcher looks up (kName, kNodeType,
// CommonConfig, AppConfig, LauncherConfig, RemoteConfigFetcher, Launcher, Config).
class MetaServer : public net::Server {
public:
static constexpr auto kName = "Meta";
static constexpr auto kNodeType = flat::NodeType::META;
// Application-level config whose constructor installs the root + event log categories and the
// normal / err / fatal / event log handlers.
struct CommonConfig : public ApplicationBase::Config {
CommonConfig() {
using logging::LogConfig;
log().set_categories({LogConfig::makeRootCategoryConfig(), LogConfig::makeEventCategoryConfig()});
log().set_handlers({LogConfig::makeNormalHandlerConfig(),
LogConfig::makeErrHandlerConfig(),
LogConfig::makeFatalHandlerConfig(),
LogConfig::makeEventHandlerConfig()});
}
};
using AppConfig = core::ServerAppConfig;
// Launcher config that resets mgmtd_client() to a default MgmtdClientForServer::Config.
struct LauncherConfig : public core::ServerLauncherConfig {
LauncherConfig() { mgmtd_client() = hf3fs::client::MgmtdClientForServer::Config{}; }
};
using RemoteConfigFetcher = core::launcher::ServerMgmtdClientFetcher;
using Launcher = core::ServerLauncher<MetaServer>;
// Server configuration.
struct Config : public ConfigBase<Config> {
CONFIG_ITEM(use_memkv, false); // deprecated
// Two listener groups by default: group 0 serves "MetaSerde" on port 8000; group 1 serves "Core"
// over TCP on port 9000 with its own thread pool.
CONFIG_OBJ(base, net::Server::Config, [](net::Server::Config &c) {
c.set_groups_length(2);
c.groups(0).listener().set_listen_port(8000);
c.groups(0).set_services({"MetaSerde"});
c.groups(1).set_network_type(net::Address::TCP);
c.groups(1).listener().set_listen_port(9000);
c.groups(1).set_use_independent_thread_pool(true);
c.groups(1).set_services({"Core"});
});
CONFIG_OBJ(fdb, kv::fdb::FDBConfig); // deprecated
CONFIG_OBJ(meta, meta::server::Config);
CONFIG_OBJ(background_client, net::Client::Config);
CONFIG_OBJ(mgmtd_client, ::hf3fs::client::MgmtdClientForServer::Config);
// Storage client retry defaults overridden for the meta server: 2s initial wait, 5s max wait,
// 5s max retry time, fail over after a single failure.
CONFIG_OBJ(storage_client, storage::client::StorageClient::Config, [](storage::client::StorageClient::Config &cfg) {
cfg.retry().set_init_wait_time(2_s);
cfg.retry().set_max_wait_time(5_s);
cfg.retry().set_max_retry_time(5_s);
cfg.retry().set_max_failures_before_failover(1);
});
CONFIG_OBJ(kv_engine, kv::HybridKvEngineConfig);
};
// `config` is kept by reference (see config_), so it must outlive the server.
MetaServer(const Config &config);
~MetaServer() override;
// Keep the base-class start() overloads visible next to the ones declared here.
using net::Server::start;
// Start with an injected KV engine, or with an injected network client + mgmtd client.
Result<Void> start(const flat::AppInfo &info, std::shared_ptr<kv::IKVEngine> kvEngine);
Result<Void> start(const flat::AppInfo &info,
std::unique_ptr<net::Client> client,
std::shared_ptr<::hf3fs::client::MgmtdClient> mgmtdClient);
// set up meta server.
Result<Void> beforeStart() final;
// tear down meta server.
Result<Void> beforeStop() final;
Result<Void> afterStop() final;
private:
const Config &config_;
std::shared_ptr<kv::IKVEngine> kvEngine_;
std::unique_ptr<net::Client> backgroundClient_;
std::shared_ptr<::hf3fs::client::MgmtdClientForServer> mgmtdClient_;
std::unique_ptr<MetaOperator> metaOperator_;
};
} // namespace hf3fs::meta::server

108
src/meta/service/MockMeta.h Normal file
View File

@@ -0,0 +1,108 @@
#pragma once
#include <algorithm>
#include <cassert>
#include <fmt/core.h>
#include <folly/Random.h>
#include <folly/Utility.h>
#include <folly/executors/CPUThreadPoolExecutor.h>
#include <folly/logging/xlog.h>
#include <map>
#include <memory>
#include <unistd.h>
#include "client/mgmtd/ICommonMgmtdClient.h"
#include "client/storage/StorageClient.h"
#include "common/app/NodeId.h"
#include "common/kv/IKVEngine.h"
#include "common/kv/mem/MemKVEngine.h"
#include "common/serde/ClientMockContext.h"
#include "common/utils/CPUExecutorGroup.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/Coroutine.h"
#include "meta/components/ChainAllocator.h"
#include "meta/components/FileHelper.h"
#include "meta/components/GcManager.h"
#include "meta/service/MetaOperator.h"
#include "meta/service/MetaSerdeService.h"
#include "meta/store/MetaStore.h"
namespace hf3fs::meta::server {
// In-process stand-in for a meta cluster: one MetaOperator per active META node found in the
// routing info, all sharing one KV engine, one mgmtd client and one in-memory storage client.
// Operators reach each other through mock serde contexts instead of the network.
class MockMeta : folly::NonCopyableNonMovable {
public:
// Builds the mock and runs init() on every operator with an empty root layout
// (chain table 1, 512 KiB chunk size, 128 as the third newEmpty argument).
// Returns the first init error, if any.
static CoTryTask<std::unique_ptr<MockMeta>> create(const Config &cfg,
std::shared_ptr<kv::IKVEngine> kv,
std::shared_ptr<client::ICommonMgmtdClient> mgmtdClient) {
auto meta = std::unique_ptr<MockMeta>(new MockMeta(cfg, kv, mgmtdClient));
for (auto &moperator : meta->operators_) {
CO_RETURN_ON_ERROR(co_await moperator->init(Layout::newEmpty(ChainTableId(1), 512 << 10, 128)));
}
co_return meta;
}
// Stops every operator on destruction (stop() is also callable explicitly).
~MockMeta() { stop(); }
void start(CPUExecutorGroup &exec) {
for (auto &moperator : operators_) {
moperator->start(exec);
}
}
void stop() {
for (auto &moperator : operators_) {
moperator->beforeStop();
moperator->afterStop();
}
}
// The accessors below all target the first operator (operators_.at(0)).
std::unique_ptr<MetaSerdeService> getService() { return std::make_unique<MetaSerdeService>(*operators_.at(0)); }
MetaOperator &getOperator() { return *operators_.at(0); }
MetaStore &getStore() { return dynamic_cast<MetaStore &>(*getOperator().metaStore_); }
storage::client::StorageClient &getStorageClient() { return *storageClient_; }
FileHelper &getFileHelper() { return *getOperator().fileHelper_; }
GcManager &getGcManager() { return *getOperator().gcManager_; }
SessionManager &getSessionManager() { return *getOperator().sessionManager_; }
private:
// Creates the in-memory storage client, then one operator per active META node. Aborts (FATAL) if
// routing info is missing, no active meta node exists, or a node id appears twice.
MockMeta(const Config &cfg,
std::shared_ptr<kv::IKVEngine> kv,
std::shared_ptr<client::ICommonMgmtdClient> mgmtdClient)
: cfg_(cfg),
mgmtdClient_(mgmtdClient) {
storageClientCfg_.set_implementation_type(storage::client::StorageClient::ImplementationType::InMem);
storageClient_ = storage::client::StorageClient::create(ClientId::random(), storageClientCfg_, *mgmtdClient_);
auto routing = mgmtdClient->getRoutingInfo();
XLOGF_IF(FATAL, !routing, "routing info not available");
auto nodes = routing->getNodeBy(flat::selectNodeByType(flat::NodeType::META) && flat::selectActiveNode());
XLOGF_IF(FATAL, nodes.empty(), "no active metas");
for (auto &node : nodes) {
XLOGF_IF(FATAL, contexts_.contains(node.app.nodeId), "duplicated {}", node.app.nodeId);
// Each Forward is handed the shared contexts_ map, which is filled in as operators are created.
auto moperator = std::make_unique<MetaOperator>(
cfg,
node.app.nodeId,
kv,
mgmtdClient_,
storageClient_,
std::make_unique<Forward>(cfg.forward(), node.app.nodeId, contexts_, mgmtdClient_));
contexts_[node.app.nodeId] = serde::ClientMockContext::create(std::make_unique<MetaSerdeService>(*moperator));
operators_.push_back(std::move(moperator));
}
}
[[maybe_unused]] const Config &cfg_;
storage::client::StorageClient::Config storageClientCfg_;
std::vector<std::unique_ptr<MetaOperator>> operators_;
std::map<flat::NodeId, serde::ClientMockContext> contexts_;
std::shared_ptr<client::ICommonMgmtdClient> mgmtdClient_;
std::shared_ptr<storage::client::StorageClient> storageClient_;
};
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,112 @@
#pragma once
#include <cassert>
#include <folly/Likely.h>
#include <folly/Synchronized.h>
#include <folly/Utility.h>
#include <folly/experimental/coro/Baton.h>
#include <folly/futures/Future.h>
#include <folly/futures/Promise.h>
#include <folly/io/async/Request.h>
#include <folly/logging/xlog.h>
#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <utility>
#include <variant>
#include "common/utils/Coroutine.h"
#include "common/utils/Result.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Schema.h"
#include "meta/store/DirEntry.h"
#include "meta/store/Inode.h"
namespace hf3fs::meta::server {
// Request-scoped cache that de-duplicates inode / dir-entry loads: the first caller asking for a key
// performs the load, later callers for the same key wait for and share that result.
// Stored as folly::RequestData under a fixed token.
class BatchContext : public folly::RequestData {
public:
// Slot shared between the loader and the waiters: the result plus a baton posted once it is set.
template <typename T>
struct SharedFuture : folly::NonCopyableNonMovable {
Result<T> value = makeError(StatusCode::kUnknown);
folly::coro::Baton baton;
};
// Handle returned by loadInode()/loadDirEntry().
//   needLoad == true  : this caller must perform the load and publish it through set();
//   needLoad == false : another caller is (or was) loading; use coAwait() to obtain the result.
template <typename T>
struct LoadGuard {
bool needLoad;
std::shared_ptr<SharedFuture<T>> future;
LoadGuard(bool needLoad, std::shared_ptr<SharedFuture<T>> future)
: needLoad(needLoad),
future(future) {}
// If the loader goes away without calling set(), publish an error so waiters are still woken.
~LoadGuard() {
if (needLoad && !future->baton.ready()) {
future->value = makeError(StatusCode::kUnknown, "load failed in BatchContext");
future->baton.post();
}
}
// Publishes the load result and wakes the waiters; must be called at most once (asserted).
void set(const Result<T> &r) {
assert(!future->baton.ready());
future->value = r;
future->baton.post();
}
// Waits until a result has been published, then returns a copy of it.
CoTryTask<T> coAwait() {
co_await future->baton;
co_return future->value;
}
};
// Returns a scope guard constructed with this class's token and a fresh BatchContext.
static folly::ShallowCopyRequestContextScopeGuard create() {
return folly::ShallowCopyRequestContextScopeGuard{token(), std::make_unique<BatchContext>()};
}
// Returns the BatchContext attached to the current request context, or nullptr when there is no
// request context or no BatchContext under the token. The LIKELY hint marks the no-context case
// as the expected one.
static inline BatchContext *get() {
auto requestContext = folly::RequestContext::try_get();
if (LIKELY(requestContext == nullptr)) {
return nullptr;
}
return dynamic_cast<BatchContext *>(requestContext->getContextData(token()));
}
LoadGuard<std::optional<Inode>> loadInode(InodeId inodeId) {
return loadImpl<InodeId, std::optional<Inode>>(inodes_, inodeId);
}
LoadGuard<std::optional<DirEntry>> loadDirEntry(InodeId parent, std::string name) {
return loadImpl<std::pair<InodeId, std::string>, std::optional<DirEntry>>(entries_, {parent, std::move(name)});
}
bool hasCallback() override { return false; }
private:
static constexpr const char *kTokenName = "hf3fs::meta::server::BatchContext";
static folly::RequestToken const &token() {
static folly::RequestToken const token(kTokenName);
return token;
}
template <typename K, typename T>
using SynchronizedFutureMap = folly::Synchronized<std::map<K, std::shared_ptr<SharedFuture<T>>>, std::mutex>;
// Looks `key` up under the map lock. An existing slot yields a waiter guard (needLoad == false);
// otherwise a new slot is inserted and a loader guard (needLoad == true) is returned.
template <typename K, typename T>
LoadGuard<T> loadImpl(SynchronizedFutureMap<K, T> &map, K key) {
auto guard = map.lock();
auto iter = guard->find(key);
if (iter != guard->end()) {
return LoadGuard<T>(false, iter->second);
}
auto future = std::make_shared<SharedFuture<T>>();
guard->emplace(std::move(key), future);
return LoadGuard<T>(true, future);
}
SynchronizedFutureMap<InodeId, std::optional<Inode>> inodes_;
SynchronizedFutureMap<std::pair<InodeId, std::string>, std::optional<DirEntry>> entries_;
};
} // namespace hf3fs::meta::server

327
src/meta/store/DirEntry.cc Normal file
View File

@@ -0,0 +1,327 @@
#include "meta/store/DirEntry.h"
#include <algorithm>
#include <boost/core/ignore_unused.hpp>
#include <cassert>
#include <fmt/core.h>
#include <fmt/format.h>
#include <folly/Likely.h>
#include <folly/experimental/coro/Collect.h>
#include <folly/experimental/coro/CurrentExecutor.h>
#include <folly/functional/Partial.h>
#include <folly/futures/Future.h>
#include <folly/logging/xlog.h>
#include <linux/limits.h>
#include <optional>
#include <string_view>
#include <utility>
#include <vector>
#include "common/kv/ITransaction.h"
#include "common/kv/KeyPrefix.h"
#include "common/monitor/Recorder.h"
#include "common/serde/Serde.h"
#include "common/utils/Coroutine.h"
#include "common/utils/MagicEnum.hpp"
#include "common/utils/Result.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Schema.h"
#include "meta/store/BatchContext.h"
#include "meta/store/Inode.h"
#include "meta/store/Utils.h"
namespace hf3fs::meta::server {
namespace {
// Key prefix written at the front of every packed DirEntry key (see DirEntry::packKey).
constexpr auto prefix = kv::KeyPrefix::Dentry;
// Returns true when `name` is usable as a directory entry name: non-empty, not "." or "..",
// and free of '/' characters.
bool checkName(std::string_view name) {
  if (name.empty() || name == "." || name == "..") {
    return false;
  }
  return name.find('/') == std::string_view::npos;
}
} // namespace
/** DirEntry */
// Serializes the KV key of the entry `name` under directory `parent`.
// Layout: Dentry key prefix, then the packed parent inode id, then the raw name bytes.
std::string DirEntry::packKey(InodeId parent, std::string_view name) {
  String packed;
  // One allocation up front: all three parts have known sizes.
  packed.reserve(sizeof(prefix) + sizeof(InodeId::Key) + name.size());

  Serializer writer{packed};
  writer.put(prefix);
  writer.put(parent.packKey());
  writer.putRaw(name.data(), name.size());
  return packed;
}
// Packs the key of this entry from its own parent id and name.
std::string DirEntry::packKey() const { return packKey(parent, name); }
// Inverse of packKey(): parses `key` and fills in `parent` and `name`.
// Returns the deserializer's error if any of the three parts cannot be read. The key prefix is only
// verified by an assert, i.e. not in builds where asserts are compiled out.
Result<Void> DirEntry::unpackKey(const std::string_view key) {
  // todo: log more data
  Deserializer reader(key);

  auto keyPrefix = reader.get<kv::KeyPrefix>();
  RETURN_ON_ERROR(keyPrefix);
  assert(keyPrefix.value() == prefix);

  auto packedParent = reader.get<InodeId::Key>();
  RETURN_ON_ERROR(packedParent);
  parent = InodeId::unpackKey(packedParent.value());

  // Everything after the parent id is the entry name.
  auto remaining = reader.getRawUntilEnd();
  RETURN_ON_ERROR(remaining);
  name = *remaining;

  return Void();
}
// Rebuilds a DirEntry from a raw KV pair.
// Returns StatusCode::kDataCorruption (after a CRITICAL log) if either the key or the value cannot
// be decoded; the key failure message includes the key bytes rendered as comma-separated hex.
Result<DirEntry> DirEntry::newUnpacked(const std::string_view key, const std::string_view value) {
  DirEntry entry;

  auto keyStatus = entry.unpackKey(key);
  if (keyStatus.hasError()) {
    auto formattedKey =
        fmt::format("{:02x}", fmt::join((uint8_t *)key.data(), (uint8_t *)key.data() + key.length(), ","));
    XLOGF(CRITICAL,
          "Failed to deserialize dirEntry key {}, error {}, data corruption!!!",
          formattedKey,
          keyStatus.error());
    return makeError(StatusCode::kDataCorruption, fmt::format("deserialize dirEntry key {} failed", formattedKey));
  }

  auto valueStatus = serde::deserialize(entry.data(), value);
  if (valueStatus.hasError()) {
    XLOGF(CRITICAL,
          "Failed to deserialize dirEntry {}/{}, {}, data corruption!!!",
          entry.parent,
          entry.name,
          valueStatus.error());
    return makeError(StatusCode::kDataCorruption);
  }

  return std::move(entry);
}
// Loads the entry `name` under `parent`, using a snapshot read when SNAPSHOT is true and a regular
// read otherwise.
//
// Returns:
//   - the entry when the key exists and its value decodes;
//   - std::nullopt when the key does not exist;
//   - MetaCode::kNameTooLong when `name` exceeds NAME_MAX (no read is issued);
//   - the transaction's error when the read fails;
//   - StatusCode::kDataCorruption when the stored value cannot be decoded.
template <const bool SNAPSHOT>
CoTryTask<std::optional<DirEntry>> DirEntry::loadImpl(IReadOnlyTransaction &txn,
                                                      InodeId parent,
                                                      std::string_view name) {
  auto getter = SNAPSHOT ? &IReadOnlyTransaction::snapshotGet : &IReadOnlyTransaction::get;

  if (name.size() > NAME_MAX) {
    XLOGF(DBG, "length of name {} > {}", name, NAME_MAX);
    co_return makeError(MetaCode::kNameTooLong, fmt::format("{} > {}", name, NAME_MAX));
  }

  auto fetched = co_await (txn.*getter)(packKey(parent, name));
  if (fetched.hasError()) {
    XLOGF(ERR, "Failed to load dirEntry {}/{}, error {}", parent, name, fetched.error());
    CO_RETURN_ERROR(fetched);
  }

  auto &stored = *fetched;
  if (!stored.has_value()) {
    co_return std::nullopt;
  }

  DirEntry entry(parent, std::string(name), {});
#ifndef NDEBUG
  // Debug-only marker consulted by the asserts in store()/remove().
  entry.snapshotLoaded_ = SNAPSHOT;
#endif
  auto decoded = serde::deserialize(entry.data(), *stored);
  if (decoded.hasError()) {
    XLOGF(CRITICAL, "Failed to deserialize dirEntry {}/{}, {}, data corruption!!", parent, name, decoded.error());
    co_return makeError(StatusCode::kDataCorruption);
  }
  co_return std::move(entry);
}
// Snapshot-reads the entry `name` under `parent`.
// When a BatchContext is attached to the current request, loads of the same (parent, name) are
// shared: the first caller performs the read and publishes it, later callers wait for that result.
// Without a BatchContext this is a plain snapshot read.
CoTryTask<std::optional<DirEntry>> DirEntry::snapshotLoad(IReadOnlyTransaction &txn,
                                                          InodeId parent,
                                                          std::string_view name) {
  auto batch = BatchContext::get();
  if (!batch) {
    co_return co_await loadImpl<true>(txn, parent, name);
  }

  auto guard = batch->loadDirEntry(parent, std::string(name));
  if (!guard.needLoad) {
    // Somebody else is (or was) loading this entry: reuse their result.
    co_return co_await guard.coAwait();
  }

  auto loaded = co_await loadImpl<true>(txn, parent, name);
  guard.set(loaded);
  co_return std::move(loaded);
}
// Reads the entry `name` under `parent` with a regular (non-snapshot) read; see loadImpl for the
// possible results. Unlike snapshotLoad(), this path never consults BatchContext.
CoTryTask<std::optional<DirEntry>> DirEntry::load(IReadOnlyTransaction &txn, InodeId parent, std::string_view name) {
  auto loaded = co_await loadImpl<false>(txn, parent, name);
  co_return std::move(loaded);
}
// Writes this entry into `txn`.
//
// Validation happens before the write, in this order:
//   - invalid name (empty, ".", "..", contains '/') -> MetaCode::kFoundBug (DFATAL log);
//   - name longer than NAME_MAX                      -> MetaCode::kNameTooLong;
//   - entry pointing at the tree root                -> MetaCode::kFoundBug (DFATAL log).
// A failing txn.set() is logged and its error returned. Debug builds additionally assert that the
// entry was not obtained through a snapshot read.
CoTryTask<void> DirEntry::store(IReadWriteTransaction &txn) const {
  assert(!snapshotLoaded_);

  if (UNLIKELY(!checkName(name))) {
    XLOGF(DFATAL, "DirEntry name {} is invalid, should never happen!!!", name);
    co_return makeError(MetaCode::kFoundBug, fmt::format("Invalid DirEntry name {}!", name));
  }
  if (UNLIKELY(name.size() > NAME_MAX)) {
    co_return makeError(MetaCode::kNameTooLong, fmt::format("name {} len > {}", name, NAME_MAX));
  }
  if (UNLIKELY(id.isTreeRoot())) {
    XLOGF(DFATAL, "DirEntry {} points to tree root, should never happen!!!", *this);
    co_return makeError(MetaCode::kFoundBug, fmt::format("DirEntry {} points to tree root", *this));
  }

  auto packedKey = packKey();
  auto packedValue = serde::serialize(data());
  auto written = co_await txn.set(packedKey, packedValue);
  if (written.hasError()) {
    XLOGF(ERR, "Failed to store dirEntry {}, error {}", *this, written.error());
    co_return written;
  }

  XLOGF(DBG, "DirEntry store {}/{}, {}", parent, name, id);
  co_return Void{};
}
// Clears this entry's key in `txn`.
// An invalid name (empty, ".", "..", contains '/') is rejected with StatusCode::kInvalidArg after a
// DFATAL log. Debug builds assert that the entry was not snapshot-loaded unless
// `ignoreSnapshotCheck` is set; the flag has no other effect.
CoTryTask<void> DirEntry::remove(IReadWriteTransaction &txn, bool ignoreSnapshotCheck) const {
  assert(!snapshotLoaded_ || ignoreSnapshotCheck);
  boost::ignore_unused(ignoreSnapshotCheck);

  const bool validName = checkName(name);
  if (UNLIKELY(!validName)) {
    XLOGF(DFATAL, "DirEntry name {} is invalid, should never happen!!!", name);
    co_return makeError(StatusCode::kInvalidArg, fmt::format("Invalid DirEntry name {}!", name));
  }

  XLOGF(DBG, "Remove direntry {}/{}", parent, name);
  auto cleared = co_await txn.clear(packKey());
  co_return std::move(cleared);
}
// Turns "inode lookup for a dir entry" into a hard result.
//   - inode present                      -> the inode;
//   - inode missing, entry is "." / ".." -> MetaCode::kNotFound (such entries are synthetic);
//   - inode missing otherwise            -> MetaCode::kInconsistent, with a CRITICAL log.
static inline Result<Inode> checkInodeExists(const DirEntry &entry, std::optional<Inode> result) {
  if (result.has_value()) {
    return std::move(*result);
  }

  const bool fakeEntry = entry.name == "." || entry.name == "..";
  if (fakeEntry) {
    // this is a fake dirEntry, so inode may not exists
    XLOGF(DBG, "Inode of entry {} doesn't exist", entry);
    return makeError(MetaCode::kNotFound);
  }

  auto msg = fmt::format("DirEntry {} exists, but Inode not found", entry);
  XLOGF(CRITICAL, "Metadata inconsistent: {}!!!", msg);
  return makeError(MetaCode::kInconsistent, std::move(msg));
}
// Verifies that the loaded inode has the type recorded in the dir entry.
// Returns the inode on a match; otherwise logs CRITICAL and returns MetaCode::kInconsistent.
static inline Result<Inode> checkInodeType(const DirEntry &entry, Inode inode) {
  if (LIKELY(inode.getType() == entry.type)) {
    return std::move(inode);
  }

  auto msg = fmt::format("DirEntry {}/{} -> {} found, but InodeType mismatch {} != {}",
                         entry.parent,
                         entry.name,
                         entry.id,
                         magic_enum::enum_name(inode.getType()),
                         magic_enum::enum_name(entry.type));
  XLOGF(CRITICAL, "Metadata inconsistent: {}!!!", msg);
  return makeError(MetaCode::kInconsistent, std::move(msg));
}
// Loads the inode that `entry` points to using LoadFunc, then validates it: it must exist
// (checkInodeExists) and its type must match the entry (checkInodeType). A load error skips both
// checks and is returned as-is.
template <typename Txn, CoTryTask<std::optional<Inode>> (*LoadFunc)(Txn &, InodeId id)>
static CoTryTask<Inode> loadInodeFromDirEntry(Txn &txn, const DirEntry &entry) {
  auto loaded = co_await (*LoadFunc)(txn, entry.id);
  co_return std::move(loaded)
      .then(folly::partial(checkInodeExists, entry))
      .then(folly::partial(checkInodeType, entry));
}
// Loads (via Inode::load) and validates the inode this entry points to.
CoTryTask<Inode> DirEntry::loadInode(IReadOnlyTransaction &txn) const {
  auto inode = co_await loadInodeFromDirEntry<IReadOnlyTransaction, &Inode::load>(txn, *this);
  co_return std::move(inode);
}
// Loads (via Inode::snapshotLoad) and validates the inode this entry points to.
CoTryTask<Inode> DirEntry::snapshotLoadInode(IReadOnlyTransaction &txn) const {
  auto inode = co_await loadInodeFromDirEntry<IReadOnlyTransaction, &Inode::snapshotLoad>(txn, *this);
  co_return std::move(inode);
}
/** DirEntryList */
// Reads the dir entries of `parent` whose keys fall between the `begin` and `end` selectors, and
// optionally the inode of each entry.
//
// Template parameter:
//   SNAPSHOT  use snapshot reads (snapshotGetRange / snapshotLoadInode) instead of regular ones.
//
// Parameters:
//   limit                 maximum number of entries to fetch; values <= 0 fall back to 128.
//   loadInodes            when false the returned list carries no inodes.
//   loadInodesConcurrent  how many inode loads are in flight at once; 0 falls back to 8.
//
// Returns the entries (in range order), the matching inodes (same order, only when requested) and
// the range read's `hasMore` flag. The first range, decode or inode-load error aborts the call.
// An entry whose parent differs from `parent` is a fatal error.
template <const bool SNAPSHOT>
CoTryTask<DirEntryList> DirEntryList::loadImpl(IReadOnlyTransaction &txn,
                                               InodeId parent,
                                               IReadOnlyTransaction::KeySelector begin,
                                               IReadOnlyTransaction::KeySelector end,
                                               int32_t limit,
                                               bool loadInodes,
                                               size_t loadInodesConcurrent) {
  auto rangeFunc = SNAPSHOT ? &IReadOnlyTransaction::snapshotGetRange : &IReadOnlyTransaction::getRange;
  auto range = co_await (txn.*rangeFunc)(begin, end, limit > 0 ? limit : 128);
  CO_RETURN_ON_ERROR(range);
  bool more = range->hasMore;

  std::vector<DirEntry> entries;
  for (auto &kv : range->kvs) {
    const auto &[key, value] = kv.pair();
    auto entry = DirEntry::newUnpacked(key, value);
    CO_RETURN_ON_ERROR(entry);
    XLOGF_IF(FATAL, entry->parent != parent, "DirEntryList::load {}, get entry {}", parent, *entry);
#ifndef NDEBUG
    // Debug-only marker consulted by the asserts in DirEntry::store()/remove().
    entry->snapshotLoaded_ = SNAPSHOT;
#endif
    entries.push_back(std::move(entry.value()));
  }

  if (!loadInodes) {
    co_return DirEntryList{std::move(entries), {}, more};
  }

  // size_t is unsigned, so "not positive" can only mean zero.
  if (loadInodesConcurrent == 0) {
    loadInodesConcurrent = 8;
  }

  auto exec = co_await folly::coro::co_current_executor;
  auto inodeFunc = SNAPSHOT ? &DirEntry::snapshotLoadInode : &DirEntry::loadInode;

  std::vector<Inode> inodes;
  inodes.reserve(entries.size());

  // Load inodes in waves of at most `loadInodesConcurrent` tasks; each wave is fully awaited before
  // the next one starts, so `entries` and `txn` outlive every task that references them.
  auto iter = entries.begin();
  while (iter != entries.end()) {
    std::vector<folly::SemiFuture<Result<Inode>>> tasks;
    while (iter != entries.end() && tasks.size() < loadInodesConcurrent) {
      auto &entry = *iter;
      tasks.push_back((entry.*inodeFunc)(txn).scheduleOn(exec).start());
      ++iter;
    }

    auto loaded = co_await folly::coro::collectAllRange(std::move(tasks));
    // Iterate by reference and move each inode out: the results are not used again.
    for (auto &inodeResult : loaded) {
      XLOGF_IF(INFO, inodeResult.hasError(), "here error {}", inodeResult.error());
      CO_RETURN_ON_ERROR(inodeResult);
      inodes.push_back(std::move(*inodeResult));
    }
  }

  co_return DirEntryList{std::move(entries), std::move(inodes), more};
}
// Snapshot-lists the entries of `parent` whose keys lie between the key of `prev` and the end of
// the directory's key range. Both selectors are built with `false` as their second argument
// (presumably "exclusive", so that `prev` itself is skipped -- confirm against KeySelector).
// `limit`, `loadInodes` and `loadInodesConcurrent` are passed through to loadImpl.
CoTryTask<DirEntryList> DirEntryList::snapshotLoad(IReadOnlyTransaction &txn,
                                                   InodeId parent,
                                                   std::string_view prev,
                                                   int32_t limit,
                                                   bool loadInodes,
                                                   size_t loadInodesConcurrent) {
  std::string firstKey = DirEntry::packKey(parent, prev);
  std::string dirPrefix = DirEntry::packKey(parent, "");
  std::string lastKey = kv::TransactionHelper::prefixListEndKey(dirPrefix);

  IReadOnlyTransaction::KeySelector selBegin{firstKey, false};
  IReadOnlyTransaction::KeySelector selEnd{lastKey, false};
  auto listed = co_await loadImpl<true>(txn, parent, selBegin, selEnd, limit, loadInodes, loadInodesConcurrent);
  co_return std::move(listed);
}
// Snapshot-lists the entries of `parent` whose keys lie between the keys of `begin` and `end`.
// Both selectors are built with `false` as their second argument (presumably "exclusive" --
// confirm against KeySelector). `limit`, `loadInodes` and `loadInodesConcurrent` are passed
// through to loadImpl.
//
// Uses loadImpl<true> so that, like the other snapshotLoad overload, the range read and the inode
// loads are snapshot reads and the debug-only snapshotLoaded_ marker is set on the entries.
CoTryTask<DirEntryList> DirEntryList::snapshotLoad(IReadOnlyTransaction &txn,
                                                   InodeId parent,
                                                   std::string_view begin,
                                                   std::string_view end,
                                                   int32_t limit,
                                                   bool loadInodes,
                                                   size_t loadInodesConcurrent) {
  std::string beginKey = DirEntry::packKey(parent, begin);
  std::string endKey = DirEntry::packKey(parent, end);
  IReadOnlyTransaction::KeySelector selBegin{beginKey, false};
  IReadOnlyTransaction::KeySelector selEnd{endKey, false};
  co_return co_await loadImpl<true>(txn, parent, selBegin, selEnd, limit, loadInodes, loadInodesConcurrent);
}
// Lists the entries of `parent` between the key of `prev` and the end of the directory's key range
// using regular (non-snapshot) reads. Inodes are never loaded on this path.
CoTryTask<DirEntryList> DirEntryList::load(IReadWriteTransaction &txn,
                                           InodeId parent,
                                           std::string_view prev,
                                           int32_t limit) {
  std::string firstKey = DirEntry::packKey(parent, prev);
  std::string dirPrefix = DirEntry::packKey(parent, "");
  std::string lastKey = kv::TransactionHelper::prefixListEndKey(dirPrefix);

  IReadOnlyTransaction::KeySelector selBegin{firstKey, false};
  IReadOnlyTransaction::KeySelector selEnd{lastKey, false};
  auto listed = co_await loadImpl<false>(txn, parent, selBegin, selEnd, limit, false, 8);
  co_return std::move(listed);
}
// Reports whether directory `parent` has no entries, by range-reading at most one key from the
// directory's key range with a regular (non-snapshot) read.
// Yields true only when the read returned no key and did not signal that more data exists.
CoTryTask<bool> DirEntryList::checkEmpty(IReadWriteTransaction &txn, InodeId parent) {
  auto dirPrefix = DirEntry::packKey(parent, "");
  auto lastKey = kv::TransactionHelper::prefixListEndKey(dirPrefix);

  IReadWriteTransaction::KeySelector selBegin(dirPrefix, false);
  IReadWriteTransaction::KeySelector selEnd(lastKey, false);
  auto probe = co_await txn.getRange(selBegin, selEnd, 1);
  CO_RETURN_ON_ERROR(probe);

  const bool noEntries = probe->kvs.empty() && !probe->hasMore;
  co_return noEntries;
}
} // namespace hf3fs::meta::server

214
src/meta/store/DirEntry.h Normal file
View File

@@ -0,0 +1,214 @@
#pragma once
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <ctime>
#include <fmt/core.h>
#include <folly/Expected.h>
#include <folly/logging/xlog.h>
#include <memory>
#include <optional>
#include <queue>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "common/kv/ITransaction.h"
#include "common/monitor/Recorder.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Result.h"
#include "common/utils/SerDeser.h"
#include "fbs/core/user/User.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Schema.h"
#include "fbs/meta/Service.h"
#include "meta/store/Inode.h"
namespace hf3fs::meta::server {
using hf3fs::kv::IReadOnlyTransaction;
using hf3fs::kv::IReadWriteTransaction;
struct DirEntryList;
// Server-side directory entry: extends the serialized meta::DirEntry with key packing and
// transactional load / store / remove helpers.
class DirEntry : public meta::DirEntry {
public:
using Base = meta::DirEntry;
using Base::Base;
// Implicit conversion from meta::DirEntry; the factory functions below rely on it to return
// a server::DirEntry from a freshly built meta::DirEntry.
DirEntry(Base base)
: Base(std::move(base)) {}
// Builds an entry from a raw key/value pair read from the KV store.
static Result<DirEntry> newUnpacked(const std::string_view key, const std::string_view value);
// Load the entry `name` under `parent`; yields std::nullopt when it does not exist.
// NOTE(review): presumably, as documented for Inode, the snapshot variant does not add the key
// to the read conflict set while `load` does - confirm in DirEntry.cc.
static CoTryTask<std::optional<DirEntry>> snapshotLoad(IReadOnlyTransaction &txn,
InodeId parent,
std::string_view name);
static CoTryTask<std::optional<DirEntry>> load(IReadOnlyTransaction &txn, InodeId parent, std::string_view name);
// Returns whether the entry exists, by mapping the optional returned by `load` to has_value().
static CoTryTask<bool> checkExist(IReadOnlyTransaction &txn, InodeId parent, std::string_view name) {
co_return (co_await DirEntry::load(txn, parent, name)).then([](auto &v) { return v.has_value(); });
}
// Factories for the three entry kinds; only directory entries carry an Acl.
static DirEntry newFile(InodeId parent, std::string name, InodeId inode) {
return meta::DirEntry(parent, name, {inode, InodeType::File});
}
static DirEntry newSymlink(InodeId parent, std::string name, InodeId inode) {
return meta::DirEntry(parent, name, {inode, InodeType::Symlink});
}
static DirEntry newDirectory(InodeId parent, std::string name, InodeId inode, Acl acl) {
return meta::DirEntry(parent, name, {inode, InodeType::Directory, acl});
}
// Entry for the root directory: named "." with the root inode as both parent and target.
static DirEntry root() {
return meta::DirEntry(InodeId::root(), ".", {InodeId::root(), InodeType::Directory, Acl::root()});
}
/** Key format: prefix + parent-InodeId.key + name */
std::string packKey() const;
static std::string packKey(InodeId parent, std::string_view name);
Result<Void> unpackKey(const std::string_view key);
// load inode from dir entry
CoTryTask<Inode> loadInode(IReadOnlyTransaction &txn) const;
CoTryTask<Inode> snapshotLoadInode(IReadOnlyTransaction &txn) const;
// Adds this entry's key to the transaction's read conflict set. In debug builds it also clears
// the snapshotLoaded_ marker.
CoTryTask<void> addIntoReadConflict(IReadWriteTransaction &txn) const {
#ifndef NDEBUG
snapshotLoaded_ = false;
#endif
co_return co_await txn.addReadConflict(packKey());
}
CoTryTask<void> store(IReadWriteTransaction &txn) const;
// NOTE(review): `ignoreSnapshotCheck` presumably bypasses the debug-only snapshotLoaded_ check -
// confirm against the definition in DirEntry.cc.
CoTryTask<void> remove(IReadWriteTransaction &txn, bool ignoreSnapshotCheck = false) const;
private:
friend struct DirEntryList;
friend class MetaTestHelper;
// Shared implementation of load / snapshotLoad, selected by the SNAPSHOT template flag.
template <const bool SNAPSHOT>
static CoTryTask<std::optional<DirEntry>> loadImpl(IReadOnlyTransaction &txn, InodeId parent, std::string_view name);
#ifndef NDEBUG
// Debug-only marker; mutable so the const addIntoReadConflict() can reset it.
mutable bool snapshotLoaded_ = false;
#endif
};
// One page of a directory listing: the entries, optionally their inodes, and a flag telling
// whether more entries remain beyond this page.
struct DirEntryList {
std::vector<DirEntry> entries;
std::vector<Inode> inodes;
bool more;
// (prev, end)
static CoTryTask<DirEntryList> snapshotLoad(IReadOnlyTransaction &txn,
InodeId parent,
std::string_view prev,
int32_t limit,
bool loadInodes = false,
size_t loadInodesConcurrent = 0);
// (begin, end)
static CoTryTask<DirEntryList> snapshotLoad(IReadOnlyTransaction &txn,
InodeId parent,
std::string_view begin,
std::string_view end,
int32_t limit,
bool loadInodes = false,
size_t loadInodesConcurrent = 0);
static CoTryTask<DirEntryList> load(IReadWriteTransaction &txn, InodeId parent, std::string_view prev, int32_t limit);
static CoTryTask<bool> checkEmpty(IReadWriteTransaction &txn, InodeId parent);
// For recursive remove and move to the trash, permission checks are required.
// However, because the directory may be very large, we may not be able to check permissions for the entire
// directory tree. This method is best effort.
//
// Walks the tree breadth-first from `parent`. `limit` is consumed once per list request and also
// caps how many directories may be waiting in the queue; once it is exhausted the remaining
// directories are skipped and the check returns success. Returns kNoPermission as soon as a
// directory entry fails checkRecursiveRmPerm for `user`.
static CoTryTask<Void> recursiveCheckRmPerm(IReadWriteTransaction &txn,
InodeId parent,
flat::UserInfo user,
int32_t limit,
size_t listBatchSize) {
static monitor::CountRecorder failed("meta_server.recursive_check_rm_perm_failed");
// Counts a failure on every exit path that does not reach guard.dismiss() below.
auto guard = folly::makeGuard([&]() {
failed.addSample(1, {{"uid", folly::to<std::string>(user.uid.toUnderType())}});
});
auto queue = std::queue<InodeId>();
queue.push(parent);
while (!queue.empty()) {
auto currDir = queue.front();
queue.pop();
auto prev = std::string();
auto foundDir = false;
auto numEntries = 0;
while (true) {
// Budget check: each list request costs one unit of `limit`.
if (limit-- <= 0) {
break;
}
// Request at least 32 entries per batch, even if the caller asked for fewer.
auto list = co_await DirEntryList::snapshotLoad(txn, currDir, prev, std::max(listBatchSize, 32ul));
CO_RETURN_ON_ERROR(list);
numEntries += list->entries.size();
for (auto &entry : list->entries) {
// Remember the last name seen so the next batch resumes after it.
prev = entry.name;
if (!entry.isDirectory()) {
continue;
}
foundDir = true;
// Only enqueue the subdirectory while the queue is smaller than the remaining budget;
// its ACL is still checked below either way.
if ((int64_t)queue.size() < limit) {
queue.push(entry.id);
}
auto &acl = *entry.dirAcl;
if (auto res = acl.checkRecursiveRmPerm(user, false); res.hasError()) {
auto msg = fmt::format("user {} recursive remove {}, found {} without permission, msg {}",
user.uid,
parent,
entry,
res.error().message());
XLOG(ERR, msg);
co_return makeError(MetaCode::kNoPermission, msg);
}
}
// Stop scanning this directory when it is exhausted, or after more than 1024 entries
// without meeting a single subdirectory.
if (!list->more || (numEntries > 1024 && !foundDir)) {
break;
}
}
}
guard.dismiss();
co_return Void{};
}
// Bounds-checked accessors (std::vector::at).
DirEntry &entry(size_t i) { return entries.at(i); }
const DirEntry &entry(size_t i) const { return entries.at(i); }
const Inode &inode(size_t i) const { return inodes.at(i); }
Inode &inode(size_t i) { return inodes.at(i); }
// Rvalue-only conversion to the RPC response: moves every entry and inode into a ListRsp.
operator ListRsp() && {
ListRsp rsp;
rsp.more = more;
rsp.entries.reserve(entries.size());
rsp.inodes.reserve(inodes.size());
for (auto &entry : entries) {
rsp.entries.emplace_back(std::move(entry));
}
for (auto &inode : inodes) {
rsp.inodes.emplace_back(std::move(inode));
}
return rsp;
}
private:
// Shared implementation of load / snapshotLoad, selected by the SNAPSHOT template flag.
template <const bool SNAPSHOT>
static CoTryTask<DirEntryList> loadImpl(IReadOnlyTransaction &txn,
InodeId parent,
IReadOnlyTransaction::KeySelector begin,
IReadOnlyTransaction::KeySelector end,
int32_t limit,
bool loadInodes,
size_t loadInodesConcurrent);
};
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,280 @@
#include "meta/store/FileSession.h"
#include <algorithm>
#include <optional>
#include <vector>
#include "common/kv/ITransaction.h"
#include "common/kv/KeyPrefix.h"
#include "common/serde/Serde.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Result.h"
#include "common/utils/SerDeser.h"
#include "fbs/meta/Common.h"
// TODO: move into meta/utils.
// Formats the bytes of `key` (any object exposing data() and size()) as a comma-separated
// sequence of uint8_t values via fmt::join, for use as a log argument.
#define FMT_KEY(key) fmt::join((const uint8_t *)key.data(), (const uint8_t *)key.data() + key.size(), ",")
namespace hf3fs::meta::server {
namespace {
struct SessionByInode {
static constexpr auto keyPrefix = kv::KeyPrefix::InodeSession;
static std::string prefixOf(const InodeId inodeId) { return Serializer::serRawArgs(keyPrefix, inodeId.packKey()); }
static std::string packKey(InodeId inodeId, Uuid sessionId) {
return Serializer::serRawArgs(keyPrefix, inodeId.packKey(), sessionId);
}
static std::pair<InodeId, Uuid> unpackKey(std::string_view key) {
kv::KeyPrefix prefix;
InodeId::Key inodeId;
Uuid sessionId;
auto result = Deserializer::deserRawArgs(key, prefix, inodeId, sessionId);
XLOGF_IF(DFATAL, result.hasError(), "Failed to unpack key {:02x}, err {}", FMT_KEY(key), result.error());
XLOGF_IF(DFATAL,
prefix != keyPrefix,
"SessionByInode prefix not match {} != {}",
(uint32_t)prefix,
(uint32_t)keyPrefix);
return {InodeId::unpackKey(inodeId), sessionId};
}
static Result<FileSession> unpack(std::string_view key, std::string_view value) {
auto [inodeId, sessionId] = unpackKey(key);
FileSession session;
RETURN_ON_ERROR(serde::deserialize(session, value));
if (session.inodeId != inodeId || session.sessionId != sessionId) {
XLOGF(DFATAL,
"SessionByInode KV not match, key {} -> {} {}, value {}",
FMT_KEY(key),
inodeId,
sessionId,
session);
return makeError(StatusCode::kDataCorruption);
}
return session;
}
};
// struct SessionByClient {
// static constexpr auto keyPrefix = kv::KeyPrefix::ClientSession;
// static std::string prefixOf(const Uuid &clientId) { return Serializer::serRawArgs(keyPrefix, clientId); }
// static std::string packKey(const Uuid &clientId, const Uuid &sessionId) {
// return Serializer::serRawArgs(keyPrefix, clientId, sessionId);
// }
// static std::pair<Uuid, Uuid> unpackKey(std::string_view key) {
// kv::KeyPrefix prefix;
// Uuid clientId;
// Uuid sessionId;
// auto result = Deserializer::deserRawArgs(key, prefix, clientId, sessionId);
// XLOGF_IF(DFATAL, result.hasError(), "Failed to unpack key {:02x}, err {}", FMT_KEY(key), result.error());
// XLOGF_IF(DFATAL,
// prefix != keyPrefix,
// "SessionByInode prefix not match {} != {}",
// (uint32_t)prefix,
// (uint32_t)keyPrefix);
// return {clientId, sessionId};
// }
// static Result<FileSession> unpack(std::string_view key, std::string_view value) {
// auto [clientId, sessionId] = unpackKey(key);
// FileSession session;
// RETURN_ON_ERROR(serde::deserialize(session, value));
// if (session.clientId.uuid != clientId || session.sessionId != sessionId) {
// XLOGF(DFATAL,
// "SessionByClient KV not match, key {} -> {} {}, value {}",
// FMT_KEY(key),
// clientId,
// sessionId,
// session);
// return makeError(StatusCode::kDataCorruption);
// }
// return session;
// }
// };
/**
 * List the sessions stored under the key prefix that SessionType derives from `id`.
 *
 * @param snapshot forwarded to ListByPrefixOptions::withSnapshot
 * @param limit    forwarded to ListByPrefixOptions::withLimit
 */
template <typename SessionType, typename Id>
CoTryTask<std::vector<FileSession>> listSessions(IReadOnlyTransaction &txn, Id id, bool snapshot, size_t limit) {
  auto listOptions = kv::TransactionHelper::ListByPrefixOptions().withSnapshot(snapshot).withLimit(limit);
  auto decode = SessionType::unpack;
  auto keyPrefix = SessionType::prefixOf(id);
  co_return co_await kv::TransactionHelper::listByPrefix<FileSession>(txn, keyPrefix, listOptions, decode);
}
/**
 * Load a single session by (id, sessionId) using SessionType's key codec.
 * Yields std::nullopt when the key is absent, and an error when the read or decode fails.
 */
template <typename SessionType, typename Id>
CoTryTask<std::optional<FileSession>> loadSession(IReadOnlyTransaction &txn, Id id, Uuid sessionId) {
  XLOGF(DBG, "Load session {}, {}", id, sessionId);
  auto key = SessionType::packKey(id, sessionId);
  auto fetched = co_await txn.get(key);
  CO_RETURN_ON_ERROR(fetched);

  // Key not present: no such session.
  if (!fetched->has_value()) {
    co_return std::nullopt;
  }

  auto session = SessionType::unpack(key, **fetched);
  CO_RETURN_ON_ERROR(session);
  XLOGF(DBG, "Load session found {}", *session);
  co_return session;
}
} // namespace
/** FileSession */
// Key prefix shared by all sessions of `inodeId`; delegates to SessionByInode::prefixOf.
std::string FileSession::prefix(InodeId inodeId) { return SessionByInode::prefixOf(inodeId); }
// Full KV key of session `sessionId` on `inodeId`; delegates to SessionByInode::packKey.
std::string FileSession::packKey(InodeId inodeId, Uuid sessionId) {
return SessionByInode::packKey(inodeId, sessionId);
}
// Decodes (inode, session) from a by-inode session key. SessionByInode::unpackKey returns a plain
// pair and reports malformed keys only through DFATAL logs, so the Result returned here always
// holds a value.
Result<std::pair<InodeId, Uuid>> FileSession::unpackByInodeKey(std::string_view key) {
return SessionByInode::unpackKey(key);
}
/**
 * Decode a FileSession from a stored value.
 *
 * `key` is used only in the failure log; unlike SessionByInode::unpack, the ids in the value
 * are not cross-checked against it. The payload field is reset to an empty string on success.
 */
Result<FileSession> FileSession::unpack(std::string_view key, std::string_view value) {
  FileSession session;
  auto decoded = serde::deserialize(session, value);
  if (!decoded.hasError()) {
    session.payload = "";
    return session;
  }

  XLOGF(DFATAL,
        "FileSession unpack failed, key {}, value {}, error {}",
        FMT_KEY(key),
        FMT_KEY(value),
        decoded.error());
  RETURN_ERROR(decoded);
}
// Loads the session `session` of `inodeId` through the by-inode index; std::nullopt if absent.
CoTryTask<std::optional<FileSession>> FileSession::load(IReadOnlyTransaction &txn, InodeId inodeId, Uuid session) {
co_return co_await loadSession<SessionByInode, InodeId>(txn, inodeId, session);
}
// Lists the sessions of `inodeId` through the by-inode index. `snapshot` and `limit` are passed
// on to the prefix listing options unchanged.
CoTryTask<std::vector<FileSession>> FileSession::list(IReadOnlyTransaction &txn,
InodeId inodeId,
bool snapshot,
size_t limit) {
co_return co_await listSessions<SessionByInode>(txn, inodeId, snapshot, limit);
}
/**
 * Check whether any session exists for `inodeId`.
 *
 * The lookup itself is a snapshot read. When no session is found, the session key range of
 * the inode is added to the transaction's read conflict set.
 *
 * @return the first session found, or std::nullopt when there is none.
 */
CoTryTask<std::optional<FileSession>> FileSession::checkExists(IReadWriteTransaction &txn, const InodeId inodeId) {
  auto exists = co_await snapshotCheckExists(dynamic_cast<IReadOnlyTransaction &>(txn), inodeId);
  CO_RETURN_ON_ERROR(exists);
  // `exists` is a Result wrapping an optional. Errors were returned above, so the Result itself
  // always holds a value here; the inner optional must be inspected, otherwise this branch is
  // dead and the conflict range is never registered.
  if (!exists->has_value()) {
    // NOTE: add range into read conflict set
    auto prefix = SessionByInode::prefixOf(inodeId);
    auto end = kv::TransactionHelper::prefixListEndKey(prefix);
    CO_RETURN_ON_ERROR(co_await txn.addReadConflictRange(kv::TransactionHelper::keyAfter(prefix), end));
  }
  co_return exists;
}
/**
 * Snapshot-read probe for any session of `inodeId`.
 *
 * Reads at most one key from the inode's session key range. If the store returns no key but
 * reports more data, the same request is issued again until it gives a definite answer.
 *
 * @return the first session found (decoded and cross-checked against its key), or std::nullopt.
 */
CoTryTask<std::optional<FileSession>> FileSession::snapshotCheckExists(IReadOnlyTransaction &txn,
                                                                       const InodeId inodeId) {
  auto prefix = SessionByInode::prefixOf(inodeId);
  auto end = kv::TransactionHelper::prefixListEndKey(prefix);

  while (true) {
    auto page = co_await txn.snapshotGetRange({prefix, false}, {end, false}, 1);
    CO_RETURN_ON_ERROR(page);
    XLOGF(DBG, "Check session for inodeId {}, cnt {} hasMore {}", inodeId, page->kvs.size(), page->hasMore);

    // Empty page with hasMore set: no answer yet, ask again.
    if (page->kvs.empty() && page->hasMore) {
      continue;
    }
    if (page->kvs.empty()) {
      co_return std::nullopt;
    }
    auto &first = page->kvs.at(0);
    co_return SessionByInode::unpack(first.key, first.value);
  }
}
/**
 * Remove every session of `inodeId` within `txn`.
 * Sessions are listed with snapshot = false and the default limit, then cleared one by one;
 * the first failure aborts and is returned.
 */
CoTryTask<void> FileSession::removeAll(IReadWriteTransaction &txn, InodeId inodeId) {
  /* todo: may be can't remove all sessions in 1 transactions */
  XLOGF(DBG, "SessionManager remove all sessions for {}", inodeId);

  auto listed = co_await list(txn, inodeId, false);
  CO_RETURN_ON_ERROR(listed);

  const auto &all = *listed;
  for (const auto &one : all) {
    CO_RETURN_ON_ERROR(co_await one.remove(txn));
  }

  XLOGF_IF(DBG, !all.empty(), "SessionManager remove {} sessions of inodeId {}.", all.size(), inodeId);
  co_return Void{};
}
/**
 * Persist this session under its by-inode key (inodeId + sessionId), overwriting any value
 * already stored there.
 */
CoTryTask<void> FileSession::store(IReadWriteTransaction &txn) const {
  XLOGF(DBG, "Store session {}", *this);
  // TODO: what if two client generate the same sessionId? should we check its existence at first?
  auto inodeKey = SessionByInode::packKey(inodeId, sessionId);
  auto encoded = serde::serialize(*this);
  CO_RETURN_ON_ERROR(co_await txn.set(inodeKey, encoded));

  // The by-client index is currently disabled:
  // auto keyByClient = SessionByClient::packKey(clientId.uuid, sessionId);
  // CO_RETURN_ON_ERROR(co_await txn.set(keyByClient, value));
  co_return Void{};
}
/** Clear this session's by-inode key (inodeId + sessionId) from the store. */
CoTryTask<void> FileSession::remove(IReadWriteTransaction &txn) const {
  XLOGF(DBG, "Remove session {}", *this);
  auto inodeKey = SessionByInode::packKey(inodeId, sessionId);
  CO_RETURN_ON_ERROR(co_await txn.clear(inodeKey));

  // The by-client index is currently disabled:
  // auto keyByClient = SessionByClient::packKey(clientId.uuid, sessionId);
  // CO_RETURN_ON_ERROR(co_await txn.clear(keyByClient));
  co_return Void{};
}
/**
 * Scan one shard of the by-inode session index, returning up to 512 sessions per call.
 *
 * A shard covers the keys from packKey(InodeId(shard), Uuid::zero()) up to
 * packKey(InodeId(shard + 1), Uuid::zero()); the last shard ends at
 * packKey(InodeId(~0ULL), Uuid::zero()).
 * NOTE(review): this assumes InodeId::packKey() orders keys so that each shard is one
 * contiguous range - confirm against InodeId.
 *
 * @param txn   transaction used for the range read
 * @param shard shard index; values >= kShard yield an empty result
 * @param prev  last session returned by the previous call, used to resume after it
 * @return the decoded sessions; an empty vector when the shard has nothing left.
 */
CoTryTask<std::vector<FileSession>> FileSession::scan(IReadOnlyTransaction &txn,
                                                      size_t shard,
                                                      std::optional<FileSession> prev) {
  if (shard >= kShard) {
    co_return std::vector<FileSession>();
  }

  // Start at the smallest key of the shard. Starting at Uuid::max() would leave every session
  // of inode InodeId(shard) outside all shard ranges, since the previous shard ends at
  // packKey(InodeId(shard), Uuid::zero()).
  auto beginKey = SessionByInode::packKey(InodeId(shard), Uuid::zero());
  if (prev) {
    auto prevKey = SessionByInode::packKey(prev->inodeId, prev->sessionId);
    beginKey = std::max(beginKey, prevKey);
  }

  auto endKey = SessionByInode::packKey(InodeId(shard + 1), Uuid::zero());
  if (shard + 1 >= kShard) {
    // Last shard: InodeId(shard + 1) no longer fits in the shard byte, so end at the largest id.
    endKey = SessionByInode::packKey(InodeId(~0ULL), Uuid::zero());
  }
  if (beginKey >= endKey) {
    co_return std::vector<FileSession>();
  }

  auto result = co_await txn.getRange({beginKey, false}, {endKey, false}, 512);
  CO_RETURN_ON_ERROR(result);

  std::vector<FileSession> sessions;
  sessions.reserve(result->kvs.size());
  for (auto &[key, value] : result->kvs) {
    auto session = FileSession::unpack(key, value);
    CO_RETURN_ON_ERROR(session);
    sessions.push_back(std::move(*session));
  }
  co_return sessions;
}
// std::string FileSession::prefix(Uuid clientId) { return SessionByClient::prefixOf(clientId); }
//
// std::string FileSession::packKey(Uuid clientId, Uuid sessionId) {
// return SessionByClient::packKey(clientId, sessionId);
// }
//
// Result<std::pair<Uuid, Uuid>> FileSession::unpackByClientKey(std::string_view key) {
// return SessionByClient::unpackKey(key);
// }
//
// CoTryTask<std::optional<FileSession>> FileSession::load(IReadOnlyTransaction &txn, ClientId clientId, Uuid session) {
// co_return co_await loadSession<SessionByClient, Uuid>(txn, clientId.uuid, session);
// }
//
// CoTryTask<std::vector<FileSession>> FileSession::list(IReadOnlyTransaction &txn, Uuid clientId, bool snapshot) {
// co_return co_await listSessions<SessionByClient>(txn, clientId, snapshot);
// }
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,81 @@
#pragma once
#include <folly/logging/xlog.h>
#include <optional>
#include <string>
#include <string_view>
#include <variant>
#include <vector>
#include "common/app/ClientId.h"
#include "common/kv/ITransaction.h"
#include "common/serde/Serde.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Result.h"
#include "common/utils/UtcTime.h"
#include "common/utils/Uuid.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Schema.h"
namespace hf3fs::meta::server {
using kv::IReadOnlyTransaction;
using kv::IReadWriteTransaction;
// A session a client holds on an inode, persisted in the KV store under a key derived from
// (inodeId, sessionId).
struct FileSession {
SERDE_STRUCT_FIELD(inodeId, InodeId());
SERDE_STRUCT_FIELD(clientId, ClientId::zero());
SERDE_STRUCT_FIELD(sessionId, Uuid::zero());
SERDE_STRUCT_FIELD(timestamp, UtcTime());
SERDE_STRUCT_FIELD(payload, std::string()); // for placeholder
public:
// Key helpers for the by-inode index.
static std::string prefix(InodeId inodeId);
static std::string packKey(InodeId inodeId, Uuid session);
static Result<std::pair<InodeId, Uuid>> unpackByInodeKey(std::string_view key);
// Factories; `timestamp` defaults to the current time and `payload` keeps its default value.
static FileSession create(InodeId inodeId, SessionInfo session, UtcTime timestamp = UtcClock::now()) {
return {inodeId, session.client, session.session, timestamp};
}
static FileSession create(InodeId inodeId, ClientId clientId, Uuid sessionId, UtcTime timestamp = UtcClock::now()) {
return {inodeId, clientId, sessionId, timestamp};
}
// Decodes a session from a stored key/value pair.
static Result<FileSession> unpack(std::string_view key, std::string_view value);
// Loads one session of `inodeId`; std::nullopt when it does not exist.
static CoTryTask<std::optional<FileSession>> load(IReadOnlyTransaction &txn, InodeId inodeId, Uuid session);
// Lists the sessions of `inodeId`. NOTE(review): limit = 0 presumably means "no limit" - confirm
// against ListByPrefixOptions.
static CoTryTask<std::vector<FileSession>> list(IReadOnlyTransaction &txn,
InodeId inodeId,
bool snapshot,
size_t limit = 0);
// Both return the first session found for `inodeId`, or std::nullopt when there is none.
static CoTryTask<std::optional<FileSession>> snapshotCheckExists(IReadOnlyTransaction &txn, const InodeId inodeId);
static CoTryTask<std::optional<FileSession>> checkExists(IReadWriteTransaction &txn, const InodeId inodeId);
// Removes every session of `inodeId`.
static CoTryTask<void> removeAll(IReadWriteTransaction &txn, InodeId inodeId);
// Number of shards accepted by scan(); shard indices must be < kShard.
static constexpr size_t kShard = 256;
static_assert(kShard == (1 << 8));
// Scans one shard of the session index, resuming after `prev` when it is set.
static CoTryTask<std::vector<FileSession>> scan(IReadOnlyTransaction &txn,
size_t shard,
std::optional<FileSession> prev);
CoTryTask<void> store(IReadWriteTransaction &txn) const;
CoTryTask<void> remove(IReadWriteTransaction &txn) const;
// prune, store FileSessions need to be pruned under special InodeId(-1)
static FileSession createPrune(ClientId clientId, Uuid sessionId) {
return FileSession::create(InodeId(-1), clientId, sessionId);
}
// Snapshot-lists up to `limit` sessions stored under the special prune inode InodeId(-1).
static CoTryTask<std::vector<FileSession>> listPrune(IReadOnlyTransaction &txn, size_t limit) {
co_return co_await FileSession::list(txn, InodeId(-1), true, limit);
}
// static CoTryTask<std::optional<FileSession>> load(IReadOnlyTransaction &txn, ClientId clientId, Uuid session);
// static CoTryTask<std::vector<FileSession>> list(IReadOnlyTransaction &txn, Uuid clientId, bool snapshot);
// static std::string prefix(Uuid clientId);
// static std::string packKey(Uuid clientId, Uuid session);
// static Result<std::pair<Uuid, Uuid>> unpackByClientKey(std::string_view key);
};
} // namespace hf3fs::meta::server

136
src/meta/store/Idempotent.h Normal file
View File

@@ -0,0 +1,136 @@
#pragma once
#include <folly/logging/xlog.h>
#include <optional>
#include <string>
#include <string_view>
#include <utility>
#include "common/kv/ITransaction.h"
#include "common/kv/KeyPrefix.h"
#include "common/serde/MessagePacket.h"
#include "common/serde/Serde.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Duration.h"
#include "common/utils/Nameof.hpp"
#include "common/utils/Result.h"
#include "common/utils/SerDeser.h"
#include "common/utils/UtcTime.h"
#include "common/utils/Uuid.h"
#include "fbs/meta/Service.h"
namespace hf3fs::meta::server {
/**
 * Stores the result of a transaction so that a retried request can be answered with the result
 * of its first execution instead of being executed again. Currently used for remove operations.
 */
struct Idempotent {
static constexpr auto keyPrefix = kv::KeyPrefix::MetaIdempotent;
// Persisted record: who sent the request, when it was stored, and the serialized result.
// Record<Void> is the only default-constructible form; it is used to read a record back before
// the typed result is decoded from `result`.
template <typename T>
struct Record {
Record() requires(std::is_same_v<T, Void>) = default;
explicit Record(const T &result)
: result(result) {}
SERDE_STRUCT_FIELD(clientId, Uuid::zero());
SERDE_STRUCT_FIELD(requestId, Uuid::zero());
SERDE_STRUCT_FIELD(timestamp, UtcTime());
SERDE_STRUCT_FIELD(result, serde::Payload<T>());
public:
// Key = keyPrefix + requestId + clientId. Logs at FATAL level if either uuid is zero.
std::string packKey() const {
// requestId + clientId to avoid hotspot
XLOGF_IF(FATAL, clientId == Uuid::zero() || requestId == Uuid::zero(), "invalid uuid");
return Serializer::serRawArgs(keyPrefix, requestId, clientId);
}
};
// Looks up the stored result of request (clientId, requestId).
// Yields std::nullopt when no record exists, the stored Result<T> when one does, and an error when
// a uuid is zero (kInvalidArg), the record or its result cannot be decoded (kDataCorruption), or
// the uuids in the record differ from the requested ones (kInconsistent).
// `req` is only used in log messages.
template <class T, class ReqInfo>
static CoTryTask<std::optional<Result<T>>> load(kv::IReadWriteTransaction &txn,
const Uuid clientId,
const Uuid requestId,
const ReqInfo &req) {
if (clientId == Uuid::zero() || requestId == Uuid::zero()) {
XLOGF(CRITICAL, "Request invalid uuid {} {}", clientId, requestId);
co_return makeError(StatusCode::kInvalidArg, "Invalid uuid");
}
// Fill in the ids first: they are needed to build the lookup key.
Record<Void> record;
record.clientId = clientId;
record.requestId = requestId;
auto res = co_await txn.get(record.packKey());
CO_RETURN_ON_ERROR(res);
if (!res->has_value()) {
co_return std::nullopt;
}
auto desRes = serde::deserialize(record, res->value());
if (!desRes) {
XLOGF(DFATAL, "IdempotentRecord deserialize failed, request {}, error {}", req, desRes.error());
co_return makeError(StatusCode::kDataCorruption, "IdempotentRecord des failed");
}
// The stored value must belong to the same request the key was built from.
if (record.clientId != clientId || record.requestId != requestId) {
XLOGF(DFATAL, "IdempotentRecord mismatch, request {}, record {}", req, record);
co_return makeError(MetaCode::kInconsistent, "IdempotentRecord uuid mismatch");
}
// Placeholder value; overwritten by the deserialization below.
Result<T> result = makeError(StatusCode::kUnknown);
auto desResult = serde::deserialize(result, record.result);
if (!desResult) {
XLOGF(DFATAL, "IdempotentRecord deserialize result failed, request {}, error {}", req, desResult.error());
co_return makeError(StatusCode::kDataCorruption, "IdempotentRecord deserialize result failed");
}
XLOGF(CRITICAL, "Duplicated request {}, result {}, prev {}, now {}", req, result, record.timestamp, UtcTime::now());
co_return std::optional(result);
}
// Stores `result` for request (clientId, requestId), stamped with the current time.
template <class T>
static CoTryTask<Void> store(kv::IReadWriteTransaction &txn,
const Uuid clientId,
const Uuid requestId,
const Result<T> &result) {
Record<Result<T>> record(result);
record.clientId = clientId;
record.requestId = requestId;
record.timestamp = UtcClock::now();
auto key = record.packKey();
auto value = serde::serialize(record);
co_return co_await txn.set(key, value);
}
// Scans up to `limit` records starting from `prev` (or from the start of the key prefix when
// `prev` is unset) and clears those whose timestamp + expire lies before the current time.
// Records that fail to deserialize are logged and left in place.
// Outputs: `total` = number of records read, `cleaned` = number of records cleared.
// Returns {key to pass as `prev` on the next call, whether the store reported more records}.
static CoTryTask<std::pair<std::string, bool>> clean(kv::IReadWriteTransaction &txn,
std::optional<std::string> prev,
Duration expire,
size_t limit,
size_t &total,
size_t &cleaned) {
auto now = UtcClock::now();
auto prefix = Serializer::serRawArgs(keyPrefix);
auto begin = prev.value_or(prefix);
XLOGF_IF(FATAL, begin < prefix, "{} < {}", begin, prefix);
auto end = kv::TransactionHelper::prefixListEndKey(prefix);
kv::IReadOnlyTransaction::KeySelector selBegin{begin, false};
kv::IReadOnlyTransaction::KeySelector selEnd{end, false};
auto res = co_await txn.getRange(selBegin, selEnd, limit);
CO_RETURN_ON_ERROR(res);
total = res->kvs.size();
cleaned = 0;
for (const auto &kv : res->kvs) {
Record<Void> record;
auto des = serde::deserialize(record, kv.value);
if (!des) {
XLOGF(CRITICAL, "IdempotentRecord deserialize failed {}", des.error());
continue;
}
if (record.timestamp + expire < now) {
cleaned++;
CO_RETURN_ON_ERROR(co_await txn.clear(kv.key));
}
}
// Resume point: the last key read, or the unchanged start key when nothing was read.
auto nextPrev = res->kvs.empty() ? begin : res->kvs.back().key;
co_return std::pair<std::string, bool>{nextPrev, res->hasMore};
}
};
} // namespace hf3fs::meta::server

226
src/meta/store/Inode.cc Normal file
View File

@@ -0,0 +1,226 @@
#include "meta/store/Inode.h"
#include <cassert>
#include <cstddef>
#include <fmt/core.h>
#include <folly/Likely.h>
#include <folly/futures/Future.h>
#include <folly/logging/xlog.h>
#include <linux/fs.h>
#include <map>
#include <optional>
#include <string_view>
#include <utility>
#include "common/kv/ITransaction.h"
#include "common/kv/KeyPrefix.h"
#include "common/serde/Serde.h"
#include "common/utils/Coroutine.h"
#include "common/utils/FaultInjection.h"
#include "common/utils/Result.h"
#include "common/utils/SerDeser.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Schema.h"
#include "meta/store/BatchContext.h"
#include "meta/store/DirEntry.h"
#include "meta/store/Utils.h"
namespace hf3fs::meta::server {
/** Inode */
/**
 * Build an Inode from a raw key/value pair: the id comes from the key, the inode data from
 * the value. A value that fails to deserialize is reported as kDataCorruption.
 */
Result<Inode> Inode::newUnpacked(std::string_view key, std::string_view value) {
  Inode inode;
  RETURN_ON_ERROR(inode.unpackKey(key));

  auto decoded = serde::deserialize(inode.data(), value);
  if (decoded.hasError()) {
    XLOGF(CRITICAL, "Failed to deserialize inode value {}, data corruption!", decoded.error());
    return makeError(StatusCode::kDataCorruption);
  }
  return std::move(inode);
}
/** KV key of inode `id`: KeyPrefix::Inode followed by the key bytes of the id. */
std::string Inode::packKey(InodeId id) {
  static constexpr auto kPrefix = kv::KeyPrefix::Inode;
  return Serializer::serRawArgs(kPrefix, id.packKey());
}
// KV key of this inode; same as packKey(id) for the inode's own id.
std::string Inode::packKey() const { return packKey(id); }
/**
 * Parse an inode key and store the decoded id into this inode.
 * A key that cannot be parsed is reported as kDataCorruption; the key prefix is verified only
 * by an assert.
 */
Result<Void> Inode::unpackKey(std::string_view key) {
  kv::KeyPrefix parsedPrefix;
  InodeId::Key rawId;
  auto parsed = Deserializer::deserRawArgs(key, parsedPrefix, rawId);
  if (parsed.hasError()) {
    XLOGF(CRITICAL, "Failed to deserialize inode key {}, data corruption!", parsed.error());
    return makeError(StatusCode::kDataCorruption);
  }
  assert(parsedPrefix == kv::KeyPrefix::Inode);

  id = InodeId::unpackKey(rawId);
  return Void{};
}
/**
 * Shared implementation of load / snapshotLoad.
 *
 * Reads the inode key with snapshotGet when SNAPSHOT is true and with get otherwise.
 * Yields std::nullopt when the key is absent and kDataCorruption when the stored value cannot
 * be deserialized. In debug builds the loaded inode remembers whether it came from a snapshot
 * read.
 */
template <const bool SNAPSHOT>
CoTryTask<std::optional<Inode>> Inode::loadImpl(IReadOnlyTransaction &txn, InodeId id) {
  auto key = packKey(id);
  auto fetched = SNAPSHOT ? co_await txn.snapshotGet(key) : co_await txn.get(key);
  if (fetched.hasError()) {
    XLOGF(ERR, "Failed to load inode {}, error {}", id, fetched.error());
    CO_RETURN_ERROR(fetched);
  }

  auto &stored = *fetched;
  if (!stored.has_value()) {
    co_return std::nullopt;
  }

  Inode inode;
  inode.id = id;
#ifndef NDEBUG
  inode.snapshotLoaded_ = SNAPSHOT;
#endif
  auto des = serde::deserialize(inode.data(), *stored);
  if (des.hasError()) {
    XLOGF(CRITICAL, "Failed to deserialize inode {}, {}, data corruption!!", inode.id, des.error());
    co_return makeError(StatusCode::kDataCorruption, fmt::format("deserialize inode {} failed", id));
  }
  co_return std::move(inode);
}
/**
 * Snapshot-load inode `id`.
 *
 * When a BatchContext is active, the load goes through it: the caller that is told to load
 * performs the read and publishes the result through the guard, and the other callers await
 * that result. Without a BatchContext the inode is read directly.
 */
CoTryTask<std::optional<Inode>> Inode::snapshotLoad(IReadOnlyTransaction &txn, InodeId id) {
  auto batch = BatchContext::get();
  if (!batch) {
    co_return co_await loadImpl<true>(txn, id);
  }

  auto guard = batch->loadInode(id);
  if (!guard.needLoad) {
    // Someone else is responsible for this inode: wait for their result.
    co_return co_await guard.coAwait();
  }

  auto loaded = co_await loadImpl<true>(txn, id);
  guard.set(loaded);
  co_return std::move(loaded);
}
// Non-snapshot load of inode `id` (loadImpl<false>); std::nullopt when the inode does not exist.
CoTryTask<std::optional<Inode>> Inode::load(IReadOnlyTransaction &txn, InodeId id) {
co_return co_await loadImpl<false>(txn, id);
}
/**
 * Validate and persist this inode.
 *
 * Validation depends on the kind of inode:
 *  - tree roots (root, gcRoot) must be directories named "" or "/", and their ACL may differ
 *    from the expected root ACL only in iflags;
 *  - other directories must not be named "." or ".." nor contain '/';
 *  - files must have a valid, non-empty layout.
 * A violation is logged as DFATAL and returned as an error without writing anything.
 */
CoTryTask<void> Inode::store(IReadWriteTransaction &txn) const {
  static const std::map<InodeId, Acl> treeRoots{{InodeId::root(), Acl::root()}, {InodeId::gcRoot(), Acl::gcRoot()}};
  assert(!snapshotLoaded_);

  if (auto root = treeRoots.find(id); root != treeRoots.end()) {
    const bool validRoot = isDirectory() && (asDirectory().name.empty() || asDirectory().name == "/");
    if (!validRoot) {
      XLOGF(DFATAL, "Store invalid root inode, {}", *this);
      co_return makeError(MetaCode::kFoundBug, fmt::format("Store invalid special inode {}", *this));
    }
    // iflags are allowed to change on a root; every other ACL field must match.
    auto expectedAcl = root->second;
    expectedAcl.iflags = acl.iflags;
    if (acl != expectedAcl) {
      XLOGF(DFATAL, "Try change root inode {} acl to {}", id, acl);
      co_return makeError(MetaCode::kNoPermission, fmt::format("try change root {} acl to {}", id, acl));
    }
  } else if (isDirectory()) {
    auto &name = asDirectory().name;
    const bool hasSlash = std::find(name.begin(), name.end(), '/') != name.end();
    if (UNLIKELY(name == "." || name == ".." || hasSlash)) {
      XLOGF(DFATAL, "DirEntry name {} is invalid, should never happen!!!", name);
      co_return makeError(MetaCode::kFoundBug, fmt::format("Directory {} invalid DirEntry name {}", id, name));
    }
  } else if (isFile()) {
    auto valid = asFile().layout.valid(false /*allowEmpty*/);
    if (valid.hasError()) {
      XLOGF(DFATAL, "File {} has a invalid layout {}, error {}", id, asFile().layout, valid.error());
      co_return makeError(MetaCode::kFoundBug,
                          fmt::format("File {} invalid layout {}, {}", id, asFile().layout, valid.error()));
    }
  }

  auto key = packKey();
  auto value = serde::serialize(data());
  auto written = co_await txn.set(key, value);
  if (written.hasError()) {
    XLOGF(ERR, "Failed to store inode {}, error {}", id, written.error());
    co_return written;
  }
  XLOGF(DBG, "Store inode {}", id);
  co_return Void{};
}
/**
 * Remove this inode from the store.
 * Tree roots and inodes flagged FS_IMMUTABLE_FL are never removed; an attempt is logged as
 * DFATAL and returned as kFoundBug.
 */
CoTryTask<void> Inode::remove(IReadWriteTransaction &txn) const {
  assert(!snapshotLoaded_);

  const bool isRoot = id.isTreeRoot();
  if (UNLIKELY(isRoot)) {
    XLOGF(DFATAL, "Don't allow remove tree root {}!", id);
    co_return makeError(MetaCode::kFoundBug, "Try remove tree root");
  }

  const auto immutableBit = acl.iflags & FS_IMMUTABLE_FL;
  if (UNLIKELY(immutableBit)) {
    XLOGF(DFATAL, "Try remove inode {} with FS_IMMUTABLE_FL", id);
    co_return makeError(MetaCode::kFoundBug, "Try remove inode with FS_IMMUTABLE_FL");
  }

  XLOGF(DBG, "Remove inode {}", id);
  co_return co_await txn.clear(packKey());
}
/**
 * Find the directory entry in the parent directory that points at this directory inode.
 *
 * When the inode records its own name, the entry is looked up directly by (parent, name).
 * Otherwise the parent directory is listed page by page until an entry with this inode's id
 * is found.
 *
 * @return the entry, kNotDirectory when this inode is not a directory, or kNotFound when no
 *         entry exists or the entry found by name points at a different inode.
 */
CoTryTask<meta::DirEntry> Inode::snapshotLoadDirEntry(IReadOnlyTransaction &txn) const {
  if (!isDirectory()) {
    co_return makeError(MetaCode::kNotDirectory);
  }
  auto parent = asDirectory().parent;
  auto name = asDirectory().name;
  std::optional<DirEntry> entry;
  if (!name.empty()) {
    auto result = co_await DirEntry::snapshotLoad(txn, parent, name);
    CO_RETURN_ON_ERROR(result);
    entry = std::move(*result);
  } else {
    std::string prev;
    while (true) {
      auto entries = co_await DirEntryList::snapshotLoad(txn, parent, prev, -1, false);
      CO_RETURN_ON_ERROR(entries);
      for (auto &item : entries->entries) {
        if (item.id == id) {
          entry = std::move(item);
          break;
        }
      }
      // Stop paging once the entry is found. Continuing would be wasted work, and if the match
      // was the last entry of the page, `prev` would be taken from the moved-from entry below,
      // which can restart the listing from the beginning and never terminate.
      if (entry.has_value() || !entries->more) break;
      if (!entries->entries.empty()) prev = entries->entries.rbegin()->name;
    }
  }
  if (!entry.has_value()) {
    XLOGF(WARN, "DirEntry of directory {} not found, parent {}, path {}, maybe deleted!", id, parent, name);
    co_return makeError(MetaCode::kNotFound);
  } else if (entry->id != id) {
    XLOGF(WARN, "InodeId of DirEntry {} != {}, maybe deleted", *entry, id);
    co_return makeError(MetaCode::kNotFound);
  }
  co_return *entry;
}
/**
 * Load the chain of ancestor directories starting at `parent` and walking upwards until a
 * directory that is its own parent is reached.
 *
 * Every loaded inode is appended to `ancestors` in walk order (nearest first). Inodes are read
 * with Inode::load, so they become part of the transaction's read conflict set.
 *
 * @return kInconsistent when the walk revisits an inode (loop in the tree), kNotDirectory when
 *         an ancestor is not a directory, or the load error (missing inodes are turned into an
 *         error by checkMetaFound).
 */
CoTryTask<Void> Inode::loadAncestors(IReadWriteTransaction &txn, std::vector<Inode> &ancestors, InodeId parent) {
  auto visited = std::set<InodeId>();
  auto cursor = parent;
  FAULT_INJECTION_SET_FACTOR(4);
  while (true) {
    // insert() reports whether the id was new; a repeated id means the tree contains a loop.
    const bool firstVisit = visited.insert(cursor).second;
    if (UNLIKELY(!firstVisit)) {
      XLOGF(DFATAL, "Inode found duplicated ancestor, parent {}, duplicated {}", parent, cursor);
      co_return makeError(MetaCode::kInconsistent, "directory tree contains loop");
    }

    // NOTE: add dst's ancestors inode into read conflict set
    auto ancestor = (co_await Inode::load(txn, cursor)).then(checkMetaFound<Inode>);
    CO_RETURN_ON_ERROR(ancestor);
    ancestors.push_back(*ancestor);
    if (UNLIKELY(!ancestor->isDirectory())) {
      XLOGF(DFATAL, "Entry {}, Inode {} is not directory", cursor, *ancestor);
      co_return makeError(MetaCode::kNotDirectory);
    }

    // A directory that is its own parent terminates the walk.
    const auto next = ancestor->asDirectory().parent;
    if (next == ancestor->id) {
      break;
    }
    cursor = next;
  }
  co_return Void{};
}
} // namespace hf3fs::meta::server

84
src/meta/store/Inode.h Normal file
View File

@@ -0,0 +1,84 @@
#pragma once
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <folly/Likely.h>
#include <folly/logging/xlog.h>
#include <optional>
#include <string>
#include <string_view>
#include <variant>
#include <vector>
#include "common/kv/ITransaction.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Path.h"
#include "common/utils/Result.h"
#include "common/utils/UtcTime.h"
#include "common/utils/Uuid.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Schema.h"
namespace hf3fs::meta::server {
using hf3fs::kv::IReadOnlyTransaction;
using hf3fs::kv::IReadWriteTransaction;
// Server-side inode: extends the serialized meta::Inode with key packing and transactional
// load / store / remove helpers.
class Inode : public meta::Inode {
public:
using Base = meta::Inode;
using Base::Base;
// Implicit conversion from an already-built meta::Inode.
Inode(Base base)
: Base(std::move(base)) {}
// Builds a new inode of the given type. The literal 1 and the three copies of `time` fill the
// remaining InodeData fields. NOTE(review): presumably link count and access/modify/change
// times - confirm against InodeData.
Inode(InodeId id, Acl acl, UtcTime time, std::variant<File, Directory, Symlink> type)
: Base{id, InodeData{type, acl, 1, time, time, time}} {}
static Inode newFile(InodeId id, Acl acl, Layout layout, UtcTime time) { return Inode(id, acl, time, File(layout)); }
static Inode newDirectory(InodeId id, InodeId parent, std::string name, Acl acl, Layout layout, UtcTime time) {
return Inode(id, acl, time, Directory{parent, std::move(layout), std::move(name)});
}
static Inode newSymlink(InodeId id, Path target, Uid uid, Gid gid, UtcTime time) {
static constexpr Permission perm{0777}; // permission of symlink is never used, and won't changed
return Inode(id, Acl(uid, gid, perm), time, Symlink{std::move(target)});
}
/** key format: kInodePrefix + InodeId.key */
static std::string packKey(InodeId id);
std::string packKey() const;
Result<Void> unpackKey(std::string_view key);
// Builds an inode from a raw key/value pair read from the KV store.
static Result<Inode> newUnpacked(std::string_view key, std::string_view value);
// The difference of `snapshotLoad` and `load` is the former won't add key of inode into read conflict set.
static CoTryTask<std::optional<Inode>> snapshotLoad(IReadOnlyTransaction &txn, InodeId id);
static CoTryTask<std::optional<Inode>> load(IReadOnlyTransaction &txn, InodeId id);
// Adds this inode's key to the transaction's read conflict set. In debug builds it also clears
// the snapshotLoaded_ marker.
CoTryTask<void> addIntoReadConflict(IReadWriteTransaction &txn) {
#ifndef NDEBUG
snapshotLoaded_ = false;
#endif
co_return co_await txn.addReadConflict(packKey());
}
CoTryTask<void> store(IReadWriteTransaction &txn) const;
/** Remove this inode */
CoTryTask<void> remove(IReadWriteTransaction &txn) const;
// Finds the entry in the parent directory that refers to this directory inode.
// NOTE(review): `DirEntry` here appears to name meta::DirEntry (the definition in Inode.cc
// returns CoTryTask<meta::DirEntry>) - confirm.
CoTryTask<DirEntry> snapshotLoadDirEntry(IReadOnlyTransaction &txn) const;
// Appends the chain of ancestor directories, starting at `parent`, to `ancestors`.
static CoTryTask<Void> loadAncestors(IReadWriteTransaction &txn, std::vector<Inode> &ancestors, InodeId parent);
private:
// Shared implementation of load / snapshotLoad, selected by the SNAPSHOT template flag.
template <const bool SNAPSHOT>
static CoTryTask<std::optional<Inode>> loadImpl(IReadOnlyTransaction &txn, InodeId id);
#ifndef NDEBUG
// Debug-only marker set by loadImpl; store() and remove() assert that it is false.
mutable bool snapshotLoaded_ = false;
#endif
};
static_assert(serde::SerializableToJson<Inode>);
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,93 @@
#include "meta/store/MetaStore.h"
#include <cassert>
#include <fmt/core.h>
#include <folly/logging/xlog.h>
#include <memory>
#include <utility>
#include "common/app/NodeId.h"
#include "common/kv/ITransaction.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Result.h"
#include "common/utils/UtcTime.h"
#include "fbs/meta/Schema.h"
#include "fbs/mgmtd/ChainRef.h"
#include "meta/components/ChainAllocator.h"
#include "meta/store/DirEntry.h"
#include "meta/store/Inode.h"
#include "meta/store/Operation.h"
#include "meta/store/Utils.h"
namespace hf3fs::meta::server {
/**
 * Operation that initializes the file system: validates the root layout and creates the root and
 * GC-root directory inodes if they are not present yet.
 */
class InitFsOp : public IOperation<Void> {
 public:
  InitFsOp(ChainAllocator &chainAlloc, Layout rootLayout)
      : chainAlloc_(chainAlloc),
        rootLayout_(std::move(rootLayout)) {}

  bool isReadOnly() final { return false; }

  CoTryTask<Void> run(IReadWriteTransaction &txn) final {
    XLOGF(INFO, "MetaStore::initFs");

    auto layoutCheck = co_await chainAlloc_.checkLayoutValid(rootLayout_);
    if (layoutCheck.hasError()) {
      XLOGF(ERR, "RootLayout is not valid, {}", layoutCheck.error());
      co_return makeError(std::move(layoutCheck.error()));
    }

    // check tree roots exist; both loads are issued before either result is inspected
    auto hasInode = [](auto &loaded) { return loaded.has_value(); };
    auto rootExists = (co_await Inode::load(txn, InodeId::root())).then(hasInode);
    auto gcRootExists = (co_await Inode::load(txn, InodeId::gcRoot())).then(hasInode);
    CO_RETURN_ON_ERROR(rootExists);
    CO_RETURN_ON_ERROR(gcRootExists);

    if (!*rootExists) {
      // no root, need create a root
      // root Inode's parent is itself, this simplify path resolution: eg /../../../a -> /a
      auto rootInode = Inode::newDirectory(InodeId::root(),
                                           InodeId::root(),
                                           "/",
                                           Acl::root(),
                                           rootLayout_,
                                           UtcClock::now().castGranularity(1_ms));
      CO_RETURN_ON_ERROR(co_await rootInode.store(txn));
    }

    if (!*gcRootExists) {
      // no GC root, need create a GC root
      auto gcRootInode = Inode::newDirectory(InodeId::gcRoot(),
                                             InodeId::gcRoot(),
                                             "/",
                                             Acl::gcRoot(),
                                             Layout() /* Invalid layout */,
                                             UtcClock::now().castGranularity(1_ms));
      CO_RETURN_ON_ERROR(co_await gcRootInode.store(txn));
    }

    co_return Void();
  }

  // NOTE: these functions won't be called in InitCluster.cc
  void retry(const Status &) final {}
  void finish(const Result<Void> &) final {}

 private:
  ChainAllocator &chainAlloc_;
  Layout rootLayout_;
};
/**
 * Create the operation that initializes the file system (root and GC root inodes).
 * `rootLayout` is taken by value and moved into the operation to avoid an extra Layout copy.
 */
MetaStore::OpPtr<Void> MetaStore::initFileSystem(ChainAllocator &chainAlloc, Layout rootLayout) {
  return std::make_unique<InitFsOp>(chainAlloc, std::move(rootLayout));
}
/** Read-only operation that touches no data and immediately returns an empty TestRpcRsp. */
class BenchRpcOp : public ReadOnlyOperation<TestRpcRsp> {
 public:
  BenchRpcOp(MetaStore &store)
      : ReadOnlyOperation<TestRpcRsp>(store) {}

  CoTryTask<TestRpcRsp> run(IReadOnlyTransaction &) override {
    co_return TestRpcRsp{};
  }
};
/** Create the no-op test RPC operation; the request itself is ignored. */
MetaStore::OpPtr<TestRpcRsp> MetaStore::testRpc(const TestRpcReq &) {
  return std::make_unique<BenchRpcOp>(*this);
}
} // namespace hf3fs::meta::server

158
src/meta/store/MetaStore.h Normal file
View File

@@ -0,0 +1,158 @@
#pragma once
#include <boost/core/ignore_unused.hpp>
#include <fcntl.h>
#include <folly/Likely.h>
#include <folly/lang/Bits.h>
#include <gtest/gtest_prod.h>
#include <memory>
#include <optional>
#include <queue>
#include <string_view>
#include <utility>
#include <variant>
#include <vector>
#include "client/mgmtd/ICommonMgmtdClient.h"
#include "client/storage/StorageClient.h"
#include "common/kv/IKVEngine.h"
#include "common/kv/ITransaction.h"
#include "common/kv/WithTransaction.h"
#include "common/monitor/Recorder.h"
#include "common/monitor/Sample.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Path.h"
#include "common/utils/Result.h"
#include "common/utils/Status.h"
#include "common/utils/UtcTime.h"
#include "fbs/meta/Service.h"
#include "meta/base/Config.h"
#include "meta/components/AclCache.h"
#include "meta/components/ChainAllocator.h"
#include "meta/components/FileHelper.h"
#include "meta/components/GcManager.h"
#include "meta/components/InodeIdAllocator.h"
#include "meta/components/SessionManager.h"
#include "meta/store/DirEntry.h"
#include "meta/store/Inode.h"
#include "meta/store/PathResolve.h"
#include "meta/store/Utils.h"
namespace hf3fs::meta::server {
using hf3fs::kv::IReadOnlyTransaction;
using hf3fs::kv::IReadWriteTransaction;
/**
 * Interface of a metadata operation producing a response of type `Rsp`.
 * An operation is executed against a read-write transaction via run(); retry() and finish()
 * are notification hooks invoked by whoever drives the operation.
 */
template <typename Rsp>
class IOperation {
 public:
  using RspT = Rsp;

  virtual ~IOperation() = default;

  /** Whether the operation only reads. */
  virtual bool isReadOnly() = 0;

  /** Defaults to true. */
  virtual bool retryMaybeCommitted() { return true; }

  /**
   * Whether the operation needs idempotency handling. Implementations returning true are expected to
   * fill `clientId` and `requestId`; the default leaves both untouched and returns false.
   */
  virtual bool needIdempotent(Uuid &clientId, Uuid &requestId) const {
    static_cast<void>(clientId);
    static_cast<void>(requestId);
    return false;
  }

  /** Operation name, "other" unless overridden. */
  virtual std::string_view name() const { return "other"; }

  /** User issuing the operation, Uid(-1) unless overridden. */
  virtual flat::Uid user() const { return flat::Uid(-1); }

  virtual CoTryTask<Rsp> run(IReadWriteTransaction &) = 0;
  virtual void retry(const Status &) = 0;
  virtual void finish(const Result<Rsp> &) = 0;

  /** Makes the operation callable as a transaction handler; simply forwards to run(). */
  CoTryTask<Rsp> operator()(IReadWriteTransaction &txn) {
    co_return co_await run(txn);
  }
};
/**
 * Factory of metadata operations. Each request method returns an operation object (OpPtr) that is
 * later run against a transaction; the store itself only holds the shared components the operations use.
 */
class MetaStore {
 public:
  /**
   * `config` and `metaEventTraceLog` are stored by reference and must outlive the store.
   * The shared components are taken by value and moved into the members (no extra refcount traffic).
   */
  MetaStore(const Config &config,
            analytics::StructuredTraceLog<MetaEventTrace> &metaEventTraceLog,
            std::shared_ptr<Distributor> distributor,
            std::shared_ptr<InodeIdAllocator> inodeAlloc,
            std::shared_ptr<ChainAllocator> chainAlloc,
            std::shared_ptr<FileHelper> fileHelper,
            std::shared_ptr<SessionManager> sessionManager,
            std::shared_ptr<GcManager> gcManager)
      : config_(config),
        metaEventTraceLog_(metaEventTraceLog),
        distributor_(std::move(distributor)),
        inodeAlloc_(std::move(inodeAlloc)),
        chainAlloc_(std::move(chainAlloc)),
        fileHelper_(std::move(fileHelper)),
        sessionManager_(std::move(sessionManager)),
        gcManager_(std::move(gcManager)),
        aclCache_(2 << 20 /* 2m acl */) {}

  auto &getEventTraceLog() { return metaEventTraceLog_; }

  template <typename Rsp>
  using Op = IOperation<Rsp>;

  template <typename Rsp>
  using OpPtr = std::unique_ptr<IOperation<Rsp>>;

  /** Build the file system initialization operation without needing a MetaStore instance. */
  static OpPtr<Void> initFileSystem(ChainAllocator &chainAlloc, Layout rootLayout);

  /** Same as initFileSystem, using this store's chain allocator. */
  OpPtr<Void> initFs(Layout rootLayout) { return MetaStore::initFileSystem(*chainAlloc_, std::move(rootLayout)); }

  OpPtr<StatFsRsp> statFs(const StatFsReq &req);
  OpPtr<StatRsp> stat(const StatReq &req);
  OpPtr<BatchStatRsp> batchStat(const BatchStatReq &req);
  OpPtr<BatchStatByPathRsp> batchStatByPath(const BatchStatByPathReq &req);
  OpPtr<GetRealPathRsp> getRealPath(const GetRealPathReq &req);
  OpPtr<OpenRsp> open(OpenReq &req);
  OpPtr<CreateRsp> tryOpen(CreateReq &req);
  OpPtr<MkdirsRsp> mkdirs(const MkdirsReq &req);
  OpPtr<SymlinkRsp> symlink(const SymlinkReq &req);
  OpPtr<RemoveRsp> remove(const RemoveReq &req);
  OpPtr<RenameRsp> rename(const RenameReq &req);
  OpPtr<ListRsp> list(const ListReq &req);
  OpPtr<SyncRsp> sync(const SyncReq &req);
  OpPtr<HardLinkRsp> hardLink(const HardLinkReq &req);
  OpPtr<SetAttrRsp> setAttr(const SetAttrReq &req);
  OpPtr<PruneSessionRsp> pruneSession(const PruneSessionReq &req);
  OpPtr<TestRpcRsp> testRpc(const TestRpcReq &req);
  OpPtr<LockDirectoryRsp> lockDirectory(const LockDirectoryReq &req);

 private:
  template <typename>
  FRIEND_TEST(TestRemove, GC);

  // Operation<Rsp> reads the components below directly.
  template <typename Rsp>
  friend class Operation;

  const Config &config_;
  analytics::StructuredTraceLog<MetaEventTrace> &metaEventTraceLog_;
  std::shared_ptr<Distributor> distributor_;
  std::shared_ptr<InodeIdAllocator> inodeAlloc_;
  std::shared_ptr<ChainAllocator> chainAlloc_;
  std::shared_ptr<FileHelper> fileHelper_;
  std::shared_ptr<SessionManager> sessionManager_;
  std::shared_ptr<GcManager> gcManager_;
  AclCache aclCache_;
};
} // namespace hf3fs::meta::server

265
src/meta/store/Operation.h Normal file
View File

@@ -0,0 +1,265 @@
#pragma once
#include <cassert>
#include <folly/ScopeGuard.h>
#include <folly/logging/xlog.h>
#include <functional>
#include <memory>
#include <optional>
#include <type_traits>
#include <utility>
#include "common/kv/ITransaction.h"
#include "common/monitor/Recorder.h"
#include "common/monitor/Sample.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Duration.h"
#include "common/utils/Result.h"
#include "common/utils/UtcTime.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Service.h"
#include "fbs/meta/Utils.h"
#include "fdb/FDBRetryStrategy.h"
#include "fdb/FDBTransaction.h"
#include "meta/components/AclCache.h"
#include "meta/components/GcManager.h"
#include "meta/components/SessionManager.h"
#include "meta/event/Event.h"
#include "meta/store/Idempotent.h"
#include "meta/store/MetaStore.h"
// Defines the name() and user() overrides of an operation class from its request object `reqName`:
// name() returns MetaSerde<>::getRpcName(reqName) and user() returns reqName.user.uid.
// Intended to be expanded inside the class body of an operation.
#define OPERATION_TAGS(reqName) \
std::string_view name() const override { return MetaSerde<>::getRpcName(reqName); } \
flat::Uid user() const override { return reqName.user.uid; }
// Validates the request `reqName` by calling reqName.valid(). When validation fails, logs a warning
// containing the RPC name and the error, then hands the failed result to CO_RETURN_ERROR
// (presumably co_returning the error, so this is only usable inside a coroutine -- confirm against the macro).
#define CHECK_REQUEST(reqName) \
do { \
if (auto result = reqName.valid(); UNLIKELY(result.hasError())) { \
auto rpcName = MetaSerde<>::getRpcName(reqName); \
XLOGF(WARN, "{} get invalid req, error {}", rpcName, result.error()); \
CO_RETURN_ERROR(result); \
} \
} while (0)
namespace hf3fs::meta::server {
/**
 * Common base of the concrete metadata operations. Gives subclasses access to the MetaStore components
 * and buffers events/traces produced while running, which are only emitted once the operation succeeds.
 */
template <typename Rsp>
class Operation : public IOperation<Rsp> {
 public:
  Operation(MetaStore &meta)
      : meta_(meta) {}

  bool isReadOnly() override { return false; }

  CoTryTask<Rsp> run(IReadWriteTransaction &) override = 0;

  /**
   * Called before the operation is run again. Everything buffered by the failed attempt is dropped;
   * traces are cleared together with events, otherwise a successful retry would also append the traces
   * recorded by the failed attempts in finish().
   */
  void retry(const Status &) override {
    clearEvents();
    traces_.clear();
  }

  /** Emit the buffered events and traces, but only when the final result is a success. */
  void finish(const Result<Rsp> &result) override {
    if (result.hasError()) {
      return;
    }
    for (const auto &event : events_) {
      event.log();
    }
    for (const auto &trace : traces_) {
      auto &traceLog = meta_.getEventTraceLog();
      traceLog.append(trace);
    }
  }

 protected:
  const Config &config() const { return meta_.config_; }
  InodeIdAllocator &inodeIdAlloc() { return *meta_.inodeAlloc_; }
  ChainAllocator &chainAlloc() { return *meta_.chainAlloc_; }
  FileHelper &fileHelper() { return *meta_.fileHelper_; }
  SessionManager &sessionManager() { return *meta_.sessionManager_; }
  GcManager &gcManager() { return *meta_.gcManager_; }
  AclCache &aclCache() { return meta_.aclCache_; }
  Distributor &distributor() { return *meta_.distributor_; }

  /** Current time rounded to the configured time granularity. */
  UtcTime now() const { return UtcClock::now().castGranularity(config().time_granularity()); }

  /** Create a path resolver for `user` configured with the symlink limits and ACL cache time from the config. */
  PathResolveOp resolve(IReadOnlyTransaction &txn, const UserInfo &user, Path *path = nullptr) {
    return PathResolveOp(txn,
                         aclCache(),
                         user,
                         path,
                         config().max_symlink_count(),
                         config().max_symlink_depth(),
                         config().acl_cache_time());
  }

  /**
   * Allocate a new InodeId, optionally tagged for the new chunk engine.
   * When `inodeId_check_unique` is enabled the id is looked up first and kInodeIdAllocFailed is returned
   * if an inode with that id already exists (aborting when `inodeId_abort_on_duplicate` is set).
   */
  CoTryTask<InodeId> allocateInodeId(IReadWriteTransaction &txn, bool newChunkEngine) {
    auto newId = co_await inodeIdAlloc().allocate();
    CO_RETURN_ON_ERROR(newId);
    if (newChunkEngine) {
      newId = InodeId::withNewChunkEngine(*newId);
    }
    if (config().inodeId_check_unique()) {
      auto loadResult = co_await Inode::load(txn, *newId);
      CO_RETURN_ON_ERROR(loadResult);
      if (loadResult->has_value()) {
        XLOGF_IF(FATAL,
                 config().inodeId_abort_on_duplicate(),
                 "InodeIdAllocator get duplicated InodeId {}",
                 newId.value());
        XLOGF(DFATAL, "InodeIdAllocator get duplicated InodeId {}", newId.value());
        co_return makeError(MetaCode::kInodeIdAllocFailed);
      }
    } else {
      XLOGF_EVERY_MS(WARN, (300 * 1000), "inodeId_check_unique is disabled");
    }
    co_return newId;
  }

  void clearEvents() { events_.clear(); }

  /** Buffer a new event of `type` and return it so the caller can fill in its fields. */
  Event &addEvent(Event::Type type) {
    events_.emplace_back(type);
    return events_.back();
  }

  void addEvent(Event event) { events_.emplace_back(std::move(event)); }

  void addTrace(MetaEventTrace &&trace) { traces_.emplace_back(std::move(trace)); }

  MetaStore &meta_;
  std::vector<Event> events_;
  std::vector<MetaEventTrace> traces_;
};
/**
 * Base of operations that only read. Subclasses implement run(IReadOnlyTransaction &);
 * the read-write entry point is sealed and forwards to it.
 */
template <typename Rsp>
class ReadOnlyOperation : public Operation<Rsp> {
 public:
  ReadOnlyOperation(MetaStore &meta)
      : Operation<Rsp>(meta) {}

  bool isReadOnly() final { return true; }

  virtual CoTryTask<Rsp> run(IReadOnlyTransaction &) = 0;

  CoTryTask<Rsp> run(IReadWriteTransaction &txn) final {
    // Bind to the read-only interface explicitly so overload resolution picks the read-only run().
    IReadOnlyTransaction &readOnlyView = txn;
    co_return co_await run(readOnlyView);
  }
};
/**
 * Runs one IOperation to completion against a read-write transaction: executes it, commits, and
 * retries failed attempts as decided by kv::FDBRetryStrategy, until the attempt succeeds, the strategy
 * refuses to retry, or the optional deadline has passed.
 * The operation and the request info are held by reference and must outlive the driver.
 */
template <typename Rsp, typename ReqInfo>
class OperationDriver {
public:
OperationDriver(MetaStore::Op<Rsp> &operation, const ReqInfo &req, std::optional<SteadyTime> deadline = std::nullopt)
: operation_(operation),
req_(req),
deadline_(deadline) {}
/**
 * Drive the operation on `txn`.
 * - `config`: retry configuration; its retryMaybeCommitted field is overwritten from the operation.
 * - `readonly`: when true, operations that are not read-only are rejected with kReadOnlyMode.
 * - `enableGrvCache`: when true and the operation is read-only, FDB_TR_OPTION_USE_GRV_CACHE is set on
 *   the transaction if it is a kv::FDBTransaction.
 * Always reports the final result to the recorder and to operation_.finish() once the retry loop is left.
 */
CoTryTask<Rsp> run(std::unique_ptr<kv::IReadWriteTransaction> txn,
kv::FDBRetryStrategy::Config config,
bool readonly,
bool enableGrvCache) {
config.retryMaybeCommitted = operation_.retryMaybeCommitted();
kv::FDBRetryStrategy strategy(config);
CO_RETURN_ON_ERROR(strategy.init(txn.get()));
OperationRecorder::Guard recorder(OperationRecorder::server(), operation_.name(), operation_.user());
if (readonly && !operation_.isReadOnly()) {
co_return makeError(StatusCode::kReadOnlyMode, "FileSystem is in readonly mode.");
}
auto grvCache = operation_.isReadOnly() && enableGrvCache;
if (grvCache && dynamic_cast<kv::FDBTransaction *>(txn.get())) {
auto fdbTxn = dynamic_cast<kv::FDBTransaction *>(txn.get());
CO_RETURN_ON_ERROR(fdbTxn->setOption(FDBTransactionOption::FDB_TR_OPTION_USE_GRV_CACHE, {}));
}
// If the deadline is already reached before the first attempt, this timeout error is what gets returned;
// after at least one attempt, the result of the last attempt is returned instead.
Result<Rsp> result = makeError(MetaCode::kOperationTimeout);
auto duplicate = false;
while (true) {
// check timeout
if (deadline_ && deadline_.value() <= SteadyClock::now()) {
XLOGF(ERR, "Request {} timeout, return error {}", describe(), result);
break;
}
// run operation
result = co_await runAndCommit(*txn, operation_, duplicate);
if (ErrorHandling::success(result)) {
break;
}
// retry: notify the operation first, then let the strategy decide whether another attempt is allowed
XLOGF(WARN, "Request {} failed, error {}", describe(), result.error());
operation_.retry(result.error());
auto retry = co_await strategy.onError(txn.get(), result.error());
if (retry.hasError()) {
result = makeError(retry.error());
break;
}
recorder.retry()++;
}
// An error whose code is kOK is treated as a bug and replaced by kFoundBug.
if (result.hasError() && result.error().code() == StatusCode::kOK) {
XLOGF(DFATAL, "Has error but error code is kOK, {}, {}", describe(), result);
result = makeError(MetaCode::kFoundBug);
}
recorder.finish(result, duplicate);
operation_.finish(result);
co_return result;
}
private:
// Loads a previously stored result for (clientId, requestId). If one exists, sets `duplicate` and
// co_returns the stored result from the enclosing coroutine. Relies on `txn`, `clientId`, `requestId`,
// `req_` and `duplicate` being in scope at the expansion site.
// NOTE(review): not #undef'd in the visible code, so the macro stays defined for including files.
#define IDEMPOTENT_CHECK() \
do { \
auto idemCheck = co_await Idempotent::load<Rsp>(txn, clientId, requestId, req_); \
CO_RETURN_ON_ERROR(idemCheck); \
duplicate = idemCheck->has_value(); \
if (duplicate) { \
co_return idemCheck->value(); \
} \
} while (0)
/**
 * Run `handler` once on `txn` and commit.
 * Idempotent path (handler is not read-only and the operation asks for idempotency): a stored result
 * short-circuits the run; otherwise the handler's result is stored and committed together with its
 * modifications. If the handler failed with a final result (per ErrorHandling::success or a
 * non-retryable error), the transaction is reset, the idempotent record is re-checked, and only the
 * result is stored and committed.
 * Non-idempotent path: commits only when the handler succeeded and is not read-only.
 */
template <typename Handler>
std::invoke_result_t<Handler, IReadWriteTransaction &> runAndCommit(IReadWriteTransaction &txn,
Handler &&handler,
bool &duplicate) {
Uuid clientId, requestId;
auto readonly = handler.isReadOnly();
auto idem = !readonly && operation_.needIdempotent(clientId, requestId);
if (idem) {
OperationRecorder::server().addIdempotentCount();
IDEMPOTENT_CHECK();
auto result = co_await handler(txn);
if (result) {
CO_RETURN_ON_ERROR(co_await Idempotent::store(txn, clientId, requestId, result));
CO_RETURN_ON_ERROR(co_await txn.commit());
} else if (ErrorHandling::success(result) || !ErrorHandling::retryable(result.error())) {
// this is final result, discard other modifications and save result
txn.reset();
IDEMPOTENT_CHECK();
CO_RETURN_ON_ERROR(co_await Idempotent::store(txn, clientId, requestId, result));
CO_RETURN_ON_ERROR(co_await txn.commit());
}
co_return result;
} else {
auto result = co_await handler(txn);
if (!result.hasError() && !readonly) {
CO_RETURN_ON_ERROR(co_await txn.commit());
}
co_return result;
}
}
/** Human readable description for logs: operation name, followed by the request when ReqInfo derives from ReqBase. */
std::string describe() const {
if constexpr (std::is_base_of_v<ReqBase, ReqInfo>) {
return fmt::format("{}{}", operation_.name(), req_);
} else {
return std::string(operation_.name());
}
}
MetaStore::Op<Rsp> &operation_;
const ReqInfo &req_;
std::optional<SteadyTime> deadline_;
};
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,364 @@
#include "meta/store/PathResolve.h"
#include <cstddef>
#include <fcntl.h>
#include <folly/Overload.h>
#include <folly/ScopeGuard.h>
#include <folly/functional/Partial.h>
#include <folly/logging/xlog.h>
#include <functional>
#include <iterator>
#include <map>
#include <numeric>
#include <variant>
#include "common/monitor/Recorder.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Duration.h"
#include "common/utils/FaultInjection.h"
#include "common/utils/Result.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Schema.h"
#include "meta/components/AclCache.h"
#include "meta/store/DirEntry.h"
#include "meta/store/Inode.h"
#include "meta/store/Utils.h"
namespace hf3fs::meta::server {
namespace {
// Distribution of the number of path components resolved per PathResolveOp (sampled in its destructor).
monitor::DistributionRecorder pathComponentsDist("meta_server.path_components");
// Same distribution, recorded with a "uid" tag of the requesting user.
monitor::DistributionRecorder pathComponentsDistUser("meta_server.path_components_by_user");
} // namespace
// Short aliases for the result types used throughout this file.
using ResolveResult = PathResolveOp::ResolveResult;
using ResolveRangeResult = PathResolveOp::ResolveRangeResult;
/**
 * Obtain enough information about `parent` to know its ACL, using the cheapest available source:
 *  1. the DirEntry itself when the caller already has one,
 *  2. the fake root entry for the root inode,
 *  3. the ACL cache,
 *  4. a snapshot load of the inode.
 * Directory ACLs obtained via (1) and (4) are written into the cache when `cacheTime` is non-zero.
 * Returns kNotDirectory when the parent is not a directory. `user` is currently unused.
 */
static CoTryTask<std::variant<std::pair<InodeId, Acl>, Inode, DirEntry>> loadParentAcl(
    IReadOnlyTransaction &txn,
    AclCache &cache,
    const UserInfo &user,
    const std::variant<InodeId, DirEntry> &parent,
    Duration cacheTime) {
  const auto parentId = getInodeId(parent);
  const bool cachingEnabled = cacheTime.count() != 0;

  std::variant<std::pair<InodeId, Acl>, Inode, DirEntry> info;
  if (const auto *entry = std::get_if<DirEntry>(&parent)) {
    // we already have Acl in DirEntry
    info = *entry;
    if (cachingEnabled && entry->isDirectory()) {
      cache.set(parentId, getDirectoryAcl(info));
    }
  } else if (parentId == InodeId::root()) {
    info = DirEntry::root();
  } else if (auto cached = cache.get(parentId, cacheTime); cached.has_value()) {
    info = std::pair<InodeId, Acl>{parentId, *cached};
  } else {
    auto loaded = (co_await Inode::snapshotLoad(txn, parentId)).then(checkMetaFound<Inode>);
    CO_RETURN_ON_ERROR(loaded);
    if (cachingEnabled && loaded->isDirectory()) {
      cache.set(parentId, loaded->acl);
    }
    info = std::move(*loaded);
  }

  if (getInodeType(info) != InodeType::Directory) {
    co_return makeError(MetaCode::kNotDirectory);
  }
  co_return info;
}
/**
 * Same as loadParentAcl, but additionally requires that `user` has EXEC (search) permission on the parent
 * directory; the permission error is returned otherwise.
 */
static CoTryTask<std::variant<std::pair<InodeId, Acl>, Inode, DirEntry>> loadAndCheckParentAcl(
    IReadOnlyTransaction &txn,
    AclCache &cache,
    const UserInfo &user,
    const std::variant<InodeId, DirEntry> &parent,
    Duration cacheTime) {
  auto parentInfo = co_await loadParentAcl(txn, cache, user, parent, cacheTime);
  CO_RETURN_ON_ERROR(parentInfo);

  auto searchAllowed = getDirectoryAcl(*parentInfo).checkPermission(user, AccessType::EXEC);
  CO_RETURN_ON_ERROR(searchAllowed);

  co_return parentInfo;
}
/** Report how many path components this resolver walked; nothing is recorded when it walked none. */
PathResolveOp::~PathResolveOp() {
  if (!pathComponents_) {
    return;
  }
  pathComponentsDist.addSample(pathComponents_);
  pathComponentsDistUser.addSample(pathComponents_, {{"uid", folly::to<std::string>(user_.uid.toUnderType())}});
}
/**
 * Resolve `path` to an Inode (snapshot read).
 * When no path is given, or the path is empty and AT_EMPTY_PATH is set, `path.parent` itself is loaded;
 * otherwise the directory entry is resolved first and its inode is loaded.
 * With `checkRefCnt`, an inode whose nlink is 0 is reported as kNotFound.
 */
CoTryTask<Inode> PathResolveOp::inode(const PathAt &path, AtFlags flags, bool checkRefCnt) {
  Result<Inode> loaded = makeError(MetaCode::kFoundBug);

  const bool useParentItself = !path.path.has_value() || (path.path->empty() && flags.contains(AT_EMPTY_PATH));
  if (useParentItself) {
    loaded = (co_await Inode::snapshotLoad(txn_, path.parent)).then(checkMetaFound<Inode>);
  } else {
    auto entry = co_await this->dirEntry(path, flags);
    CO_RETURN_ON_ERROR(entry);
    assert(!flags.followLastSymlink() || !entry->isSymlink());
    loaded = co_await entry->snapshotLoadInode(txn_);
    // XLOGF_IF(DFATAL, (inode.hasValue() && inode->nlink == 0), "entry {} -> inode {}, nlink == 0", *entry, *inode);
  }

  const bool alreadyRemoved = !loaded.hasError() && checkRefCnt && loaded->nlink == 0;
  if (alreadyRemoved) {
    co_return makeError(MetaCode::kNotFound,
                        fmt::format("path {}, inode {} is removed", path, loaded->id.toHexString()));
  }
  co_return loaded;
}
/** Resolve `path` to its directory entry; kInvalidArg when the path is not set. */
CoTryTask<DirEntry> PathResolveOp::dirEntry(const PathAt &path, AtFlags flags) {
  if (!path.path.has_value()) {
    co_return makeError(StatusCode::kInvalidArg, "path not set");
  }
  co_return co_await dirEntry(path.parent, *path.path, flags.followLastSymlink());
}
/**
 * Resolve `path` relative to `parent` and return the directory entry of its last component.
 * A trailing symlink is followed when `followLastSymlink` is set. Returns kNotFound when the
 * final entry does not exist.
 */
CoTryTask<DirEntry> PathResolveOp::dirEntry(InodeId parent, const Path &path, bool followLastSymlink) {
  auto resolved = co_await this->path(parent, path);
  CO_RETURN_ON_ERROR(resolved);

  auto &last = resolved->dirEntry;
  if (last.has_value() && last->isSymlink() && followLastSymlink) {
    XLOGF(DBG, "Resolve dir entry get symlink, follow it.");
    resolved = co_await this->symlink(*last);
    CO_RETURN_ON_ERROR(resolved);
  }

  if (!resolved->dirEntry.has_value()) {
    co_return makeError(MetaCode::kNotFound);
  }
  co_return std::move(*resolved->dirEntry);
}
/**
 * Build a ResolveResult starting from an inode id instead of a path: loads the inode, its directory entry,
 * and the ACL information of the entry's parent (no permission check is performed here).
 */
CoTryTask<ResolveResult> PathResolveOp::byDirectoryInodeId(InodeId inodeId) {
  auto target = (co_await Inode::snapshotLoad(txn_, inodeId)).then(checkMetaFound<Inode>);
  CO_RETURN_ON_ERROR(target);

  auto targetEntry = co_await target->snapshotLoadDirEntry(txn_);
  CO_RETURN_ON_ERROR(targetEntry);

  auto parentInfo = co_await loadParentAcl(txn_, aclCache_, user_, targetEntry->parent, aclCacheTime_);
  CO_RETURN_ON_ERROR(parentInfo);

  co_return ResolveResult(*parentInfo, *targetEntry);
}
/** Resolve `path`, honoring the follow-last-symlink flag; kInvalidArg when the path is not set. */
CoTryTask<ResolveResult> PathResolveOp::path(const PathAt &path, AtFlags flags) {
  if (!path.path.has_value()) {
    co_return makeError(StatusCode::kInvalidArg, "path not set");
  }
  co_return co_await this->path(path.parent, *path.path, flags.followLastSymlink());
}
/**
 * Resolve path, returning the parent of the last path component and the DirEntry if present.
 * Doesn't follow a trailing symlink.
 * Only the last component may be missing; if a middle component is missing, kNotFound is returned.
 */
CoTryTask<ResolveResult> PathResolveOp::path(InodeId parent, const Path &path) {
  XLOGF(DBG, "Resolve path {}/{}", parent, path);

  auto cursor = path.begin();
  auto resolved = co_await this->pathRange(parent, cursor, path.end());
  CO_RETURN_ON_ERROR(resolved);

  XLOGF(DBG,
        "Resolve path {}, {} components, {} found, {} missing",
        path,
        std::distance(path.begin(), path.end()),
        std::distance(path.begin(), cursor),
        std::distance(cursor, path.end()));

  // `cursor` stops at the first missing component; more than one remaining component means
  // some middle path components are missing.
  if (cursor != path.end() && ++cursor != path.end()) {
    co_return makeError(MetaCode::kNotFound);
  }
  co_return resolved;
}
/** Resolve `path` relative to `parent`; when requested, a trailing symlink is resolved to its target. */
CoTryTask<ResolveResult> PathResolveOp::path(InodeId parent, const Path &path, bool followLastSymlink) {
  auto resolved = co_await this->path(parent, path);
  CO_RETURN_ON_ERROR(resolved);

  auto &last = resolved->dirEntry;
  const bool trailingSymlink = followLastSymlink && last.has_value() && last->isSymlink();
  if (trailingSymlink) {
    co_return co_await symlink(*last);
  }
  co_return resolved;
}
/**
 * Resolve as many components of `path` as exist and report the remaining (missing) components,
 * joined back into a Path, alongside the resolve result. kInvalidArg when the path is not set.
 */
CoTryTask<ResolveRangeResult> PathResolveOp::pathRange(const PathAt &path) {
  if (!path.path.has_value()) {
    co_return makeError(StatusCode::kInvalidArg, "path not set");
  }

  auto cursor = path.path->begin();
  auto last = path.path->end();
  auto resolved = co_await pathRange(path.parent, cursor, last);
  CO_RETURN_ON_ERROR(resolved);

  // `cursor` now points at the first component that was not resolved; join the rest with operator/.
  auto missing = std::accumulate(cursor, last, Path(), std::divides());
  co_return ResolveRangeResult(std::move(*resolved), std::move(missing));
}
/**
 * Walk along and resolve path components from begin to end.
 * `begin` is an in/out parameter: it is advanced past every component that was consumed.
 * If parent doesn't exist or path is empty, return kNotFound.
 * If parent is a symlink, return kNotDirectory; if any middle path component points to a symlink, try to resolve it.
 * If parent or any middle path component is a symlink that points to a non-existent path, return kNotFound.
 * If parent or any middle path component is a file, return kNotDirectory.
 * If parent or any middle path component points to a deleted directory, return kNotFound.
 * If user does not have search permission on a directory, return kNoPermission.
 * If there are too many symlinks during path resolution, return kTooManySymlinks.
 * If any path component is missing, begin will point to the corresponding path component, and its parent is returned.
 * If resolution succeeds, begin will point to end and the last parent and DirEntry are returned.
 */
CoTryTask<ResolveResult> PathResolveOp::pathRange(InodeId parentId,
Path::const_iterator &begin,
const Path::const_iterator &end) {
// On every exit, log (at DBG) the components that were left unresolved, if any.
SCOPE_EXIT {
auto dis = std::distance(begin, end);
XLOGF_IF(DBG,
dis,
"PathResolveOp::pathRange {} components missing, {}!",
dis,
std::accumulate(begin, end, Path(), std::divides()));
};
std::variant<InodeId, DirEntry> parent(parentId);
if (begin == end) {
co_return makeError(MetaCode::kNotFound);
}
if (*begin == "/") {
// lookup from root;
parent = InodeId::root();
if (++begin == end) {
if (trace_) {
*trace_ = "/";
}
// special case: path range only contains "/", just load root inode and make a fake directory entry
co_return ResolveResult(DirEntry::root(), DirEntry::root());
}
}
FAULT_INJECTION_SET_FACTOR(std::distance(begin, end));
while (begin != end) {
// "." in the middle of the path is skipped; a trailing "." is resolved against the current parent
if (begin->filename_is_dot()) {
if (++begin == end) {
co_return co_await pathComponent(parent, ".");
}
continue;
}
// resolve current path component
auto resolveResult = co_await pathComponent(parent, *begin);
CO_RETURN_ON_ERROR(resolveResult);
if (!resolveResult->dirEntry.has_value()) {
// dirEntry not found, just means this path component is missing,
// return parentInode and let caller decide create missing path components or not
// (begin is intentionally NOT advanced here, so it keeps pointing at the missing component)
co_return resolveResult;
}
if (++begin == end) {
// this is last component, return here
co_return resolveResult;
}
// middle path component
if (resolveResult->dirEntry->isSymlink()) {
resolveResult = co_await this->symlink(*resolveResult->dirEntry);
CO_RETURN_ON_ERROR(resolveResult);
if (!resolveResult->dirEntry.has_value()) {
co_return makeError(MetaCode::kNotFound);
}
}
if (resolveResult->dirEntry->isFile()) {
co_return makeError(MetaCode::kNotDirectory);
}
// update parent and continue.
parent = std::move(resolveResult->dirEntry.value());
}
// Every path that advances begin to end returns from inside the loop, so the loop never exits normally.
__builtin_unreachable();
}
/**
 * Resolve a single path component.
 * If parent doesn't exist, return kNotFound or kInconsistent.
 * If parent exists but is deleted, return kNotFound.
 * If parent is a file or symlink, return kNotDirectory.
 * If parent is a directory, but user doesn't have search permission, return kNoPermission.
 * Else return the parent and the dirEntry if it exists. "." and ".." yield synthesized directory entries.
 */
CoTryTask<ResolveResult> PathResolveOp::pathComponent(const std::variant<InodeId, DirEntry> &parent, const Path &name) {
  // todo: For each directory, we need load it's Inode to check permission,
  // this adds performance overhead to path resolution.
  // A simple way to mitigate this is cache Inode permission information,
  // if we can tolerate chmod doesn't make effect for several seconds.
  auto parentId = getInodeId(parent);
  if (trace_) {
    if (parentId == InodeId::root()) {
      // todo: many for other root?
      *trace_ = "/";
    }
    *trace_ /= name;
  }

  const bool isDot = name.filename_is_dot();
  if (!isDot) {
    ++pathComponents_;
  }

  auto parentInfo = co_await loadAndCheckParentAcl(txn_, aclCache_, user_, parent, aclCacheTime_);
  CO_RETURN_ON_ERROR(parentInfo);

  if (isDot) {
    // "." refers to the parent directory itself
    co_return ResolveResult(*parentInfo, DirEntry::newDirectory(parentId, ".", parentId, getDirectoryAcl(*parentInfo)));
  }

  if (name.filename_is_dot_dot()) {
    // ".." needs the parent's Inode to learn the grandparent id.
    // NOTE(review): the synthesized ".." entry points at the grandparent id but carries the parent's ACL,
    // exactly as before -- confirm this is intended.
    auto makeDotDot = [&parentId](Inode dir) {
      auto grandParentId = dir.asDirectory().parent;
      auto dirAcl = dir.acl;
      return ResolveResult(std::move(dir), DirEntry::newDirectory(parentId, "..", grandParentId, dirAcl));
    };
    if (std::holds_alternative<Inode>(*parentInfo)) {
      co_return makeDotDot(std::move(std::get<Inode>(*parentInfo)));
    }
    auto parentInode = (co_await Inode::snapshotLoad(txn_, getInodeId(*parentInfo))).then(checkMetaFound<Inode>);
    CO_RETURN_ON_ERROR(parentInode);
    co_return makeDotDot(std::move(*parentInode));
  }

  auto entry = co_await DirEntry::snapshotLoad(txn_, parentId, name.native());
  CO_RETURN_ON_ERROR(entry);
  co_return ResolveResult(std::move(*parentInfo), std::move(entry.value()));
}
/** Load the inode behind `entry` (snapshot read) and return its symlink target path. */
static CoTryTask<Path> loadSymLinkTarget(IReadOnlyTransaction &txn, const DirEntry &entry) {
  auto linkInode = co_await entry.snapshotLoadInode(txn);
  CO_RETURN_ON_ERROR(linkInode);
  auto &link = linkInode.value().asSymlink();
  co_return std::move(link.target);
}
/**
 * Resolve the symlink `entry` until a non-symlink entry (or a missing entry) is reached.
 *
 * Two limits apply, both yielding kTooManySymlinks:
 *  - `maxSymlinkDepth_` bounds the nesting of symlink() calls (symlinks met while resolving a symlink target);
 *  - `maxSymlinkCount_` bounds the total number of symlinks followed by this PathResolveOp.
 */
CoTryTask<ResolveResult> PathResolveOp::symlink(DirEntry entry) {
  // Register the decrement before checking the limit, so depth_ is restored on every exit path,
  // including the "too deep" error return (previously that path left depth_ incremented).
  ++depth_;
  SCOPE_EXIT { depth_--; };
  if (depth_ > maxSymlinkDepth_) {
    co_return makeError(MetaCode::kTooManySymlinks);
  }

  while (true) {
    if (++symlinkCnt_ > maxSymlinkCount_) {
      co_return makeError(MetaCode::kTooManySymlinks);
    }
    auto symlinkTarget = co_await loadSymLinkTarget(txn_, entry);
    CO_RETURN_ON_ERROR(symlinkTarget);
    if (trace_) {
      // the symlink's own name is replaced by whatever its target resolves to
      trace_->remove_filename();
    }
    // the target is resolved relative to the directory containing the symlink
    auto resolveResult = co_await this->path(entry.parent, *symlinkTarget);
    CO_RETURN_ON_ERROR(resolveResult);
    if (!resolveResult->dirEntry.has_value() || !resolveResult->dirEntry->isSymlink()) {
      co_return resolveResult;
    }
    // the target is itself a symlink: keep following
    entry = std::move(*resolveResult->dirEntry);
  }
}
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,116 @@
#pragma once
#include <folly/Overload.h>
#include <folly/Utility.h>
#include <gtest/gtest_prod.h>
#include <optional>
#include <utility>
#include <variant>
#include "common/kv/ITransaction.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Duration.h"
#include "common/utils/Path.h"
#include "fbs/meta/Common.h"
#include "meta/components/AclCache.h"
#include "meta/store/DirEntry.h"
#include "meta/store/Inode.h"
#include "meta/store/Utils.h"
namespace hf3fs::meta::server {
/**
 * Path Resolution.
 *
 * Note: PathResolveOp always uses snapshotLoad, so it won't add any key into the read conflict set.
 * Users should add keys into the read conflict set manually if needed.
 *
 * The transaction, user info and ACL cache are held by reference and must outlive the resolver.
 */
class PathResolveOp : folly::NonCopyableNonMovable {
 public:
  struct ResolveResult {
    // for parent, we may already have its Inode, or just the dirEntry pointing to it, or just the cached acl
    std::variant<std::pair<InodeId, Acl>, Inode, DirEntry> parent;
    // entry of the resolved component; empty when the component does not exist
    std::optional<DirEntry> dirEntry;

    ResolveResult(std::variant<std::pair<InodeId, Acl>, Inode, DirEntry> parent, std::optional<DirEntry> dirEntry)
        : parent(std::move(parent)),
          dirEntry(std::move(dirEntry)) {}

    InodeId getParentId() const { return getInodeId(parent); }
    Acl getParentAcl() const { return getDirectoryAcl(parent); }

    /** Return the parent's Inode, loading it (snapshot read) unless it is already held. */
    CoTryTask<Inode> getParentInode(kv::IReadOnlyTransaction &txn) const {
      if (std::holds_alternative<Inode>(parent)) {
        co_return std::get<Inode>(parent);
      } else if (std::holds_alternative<DirEntry>(parent)) {
        co_return co_await std::get<DirEntry>(parent).snapshotLoadInode(txn);
      } else {
        auto parentId = std::get<std::pair<InodeId, Acl>>(parent).first;
        co_return (co_await Inode::snapshotLoad(txn, parentId)).then(checkMetaFound<Inode>);
      }
    }
  };

  /** ResolveResult plus the path components that could not be resolved. */
  struct ResolveRangeResult : ResolveResult {
    Path missing;

    ResolveRangeResult(ResolveResult result, Path missing)
        : ResolveResult(std::move(result)),
          missing(std::move(missing)) {}
  };

  /** Convenience constructor: at most 4 symlinks in total, nesting depth 8, ACL cache time 5s. */
  PathResolveOp(IReadOnlyTransaction &txn, AclCache &aclCache, const UserInfo &userInfo, Path *trace = nullptr)
      : PathResolveOp(txn, aclCache, userInfo, trace, 4, 8, 5_s) {}

  /**
   * `trace`, when not null, is updated with the path components as they are walked.
   * `maxSymlinkCount` bounds the total number of symlinks followed, `maxSymlinkDepth` the nesting depth.
   */
  PathResolveOp(IReadOnlyTransaction &txn,
                AclCache &aclCache,
                const UserInfo &userInfo,
                Path *trace,
                size_t maxSymlinkCount,
                size_t maxSymlinkDepth,
                Duration aclCacheTime)
      : txn_(txn),
        user_(userInfo),
        aclCache_(aclCache),
        trace_(trace),
        depth_(0),
        symlinkCnt_(0),
        maxSymlinkCount_(maxSymlinkCount),
        maxSymlinkDepth_(maxSymlinkDepth),
        aclCacheTime_(aclCacheTime),
        pathComponents_(0) {}

  ~PathResolveOp();

  CoTryTask<Inode> inode(const PathAt &path, AtFlags flags, bool checkRefCnt);
  CoTryTask<DirEntry> dirEntry(const PathAt &path, AtFlags flags);
  CoTryTask<ResolveResult> path(const PathAt &path, AtFlags flags);
  CoTryTask<ResolveResult> byDirectoryInodeId(InodeId inodeId);
  CoTryTask<ResolveRangeResult> pathRange(const PathAt &path);
  CoTryTask<ResolveResult> symlink(DirEntry entry);

 private:
  template <typename>
  FRIEND_TEST(TestResolve, ResolveComponent);

  CoTryTask<DirEntry> dirEntry(InodeId parent, const Path &path, bool followLastSymlink);
  CoTryTask<ResolveResult> path(InodeId parent, const Path &path);
  CoTryTask<ResolveResult> path(InodeId parent, const Path &path, bool followLastSymlink);
  CoTryTask<ResolveResult> pathComponent(const std::variant<InodeId, DirEntry> &parent, const Path &name);
  CoTryTask<ResolveResult> pathRange(InodeId parent, Path::const_iterator &begin, const Path::const_iterator &end);

  IReadOnlyTransaction &txn_;
  const UserInfo &user_;
  AclCache &aclCache_;
  Path *trace_;
  size_t depth_;             // current nesting of symlink() calls
  size_t symlinkCnt_;        // symlinks followed so far by this resolver
  size_t maxSymlinkCount_;
  size_t maxSymlinkDepth_;
  Duration aclCacheTime_;
  size_t pathComponents_;    // number of components walked, reported in the destructor
};
} // namespace hf3fs::meta::server

78
src/meta/store/Utils.h Normal file
View File

@@ -0,0 +1,78 @@
#pragma once
#include <algorithm>
#include <cassert>
#include <fcntl.h>
#include <folly/Overload.h>
#include <folly/logging/xlog.h>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "client/mgmtd/ICommonMgmtdClient.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Result.h"
#include "common/utils/StatusCode.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Schema.h"
#include "meta/store/DirEntry.h"
#include "meta/store/Inode.h"
namespace hf3fs::meta::server {
/**
 * Extracts the InodeId from a variant holding an Inode, a DirEntry, a plain InodeId or a cached
 * (InodeId, Acl) pair.
 */
template <typename T>
inline InodeId getInodeId(T &&val) {
  auto fromInode = [](const Inode &ino) { return ino.id; };
  auto fromEntry = [](const DirEntry &dirent) { return dirent.id; };
  auto fromId = [](const InodeId &inodeId) { return inodeId; };
  auto fromCachedAcl = [](const std::pair<InodeId, Acl> &idAndAcl) { return idAndAcl.first; };
  return folly::variant_match(std::forward<T>(val), fromInode, fromEntry, fromId, fromCachedAcl);
}
/**
 * Extracts the directory ACL from a variant holding an Inode, a DirEntry or a cached (InodeId, Acl) pair.
 * In debug builds the Inode/DirEntry alternatives are asserted to describe a directory.
 */
template <typename T>
inline Acl getDirectoryAcl(T &&val) {
  auto fromInode = [](const Inode &ino) {
    assert(ino.isDirectory());
    return ino.acl;
  };
  auto fromEntry = [](const DirEntry &dirent) {
    assert(dirent.isDirectory() && dirent.dirAcl.has_value());
    return *dirent.dirAcl;
  };
  auto fromCachedAcl = [](const std::pair<InodeId, Acl> &idAndAcl) { return idAndAcl.second; };
  return folly::variant_match(std::forward<T>(val), fromInode, fromEntry, fromCachedAcl);
}
/**
 * Returns the inode type described by a variant holding an Inode, a DirEntry or a cached (InodeId, Acl) pair.
 * A cached ACL pair is always reported as InodeType::Directory.
 */
template <typename T>
inline InodeType getInodeType(T &&val) {
  auto fromInode = [](const Inode &ino) { return ino.getType(); };
  auto fromEntry = [](const DirEntry &dirent) { return dirent.type; };
  auto fromCachedAcl = [](const std::pair<InodeId, Acl> &) { return InodeType::Directory; };
  return folly::variant_match(std::forward<T>(val), fromInode, fromEntry, fromCachedAcl);
}
/** Converts an optional lookup result into a Result: the contained value, or MetaCode::kNotFound when empty. */
template <typename T>
inline Result<T> checkMetaFound(std::optional<T> val) {
  if (val.has_value()) {
    return std::move(*val);
  }
  return makeError(MetaCode::kNotFound);
}
/**
 * Tells whether `nodeId` is the active META node with the smallest node id in the current routing info.
 * Returns false when no routing info is available or no active META node exists.
 */
inline bool isFirstMeta(client::ICommonMgmtdClient &mgmtd, flat::NodeId nodeId) {
  auto routingInfo = mgmtd.getRoutingInfo();
  if (!routingInfo) {
    return false;
  }
  auto activeMetas = routingInfo->getNodeBy(flat::selectNodeByType(flat::NodeType::META) && flat::selectActiveNode());
  auto byNodeId = [](auto &lhs, auto &rhs) { return lhs.app.nodeId < rhs.app.nodeId; };
  auto smallest = std::min_element(activeMetas.begin(), activeMetas.end(), byNodeId);
  if (smallest == activeMetas.end()) {
    return false;
  }
  return smallest->app.nodeId == nodeId;
}
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,751 @@
#include "BatchOperation.h"
#include <algorithm>
#include <bits/ranges_algo.h>
#include <boost/iterator/transform_iterator.hpp>
#include <cassert>
#include <exception>
#include <fcntl.h>
#include <folly/Likely.h>
#include <folly/Range.h>
#include <folly/ScopeGuard.h>
#include <folly/Synchronized.h>
#include <folly/experimental/coro/Collect.h>
#include <folly/experimental/coro/CurrentExecutor.h>
#include <folly/functional/Partial.h>
#include <folly/futures/Future.h>
#include <folly/io/async/Request.h>
#include <folly/logging/xlog.h>
#include <functional>
#include <iterator>
#include <map>
#include <optional>
#include <ranges>
#include <utility>
#include <vector>
#include "common/kv/ITransaction.h"
#include "common/utils/Coroutine.h"
#include "common/utils/OptionalUtils.h"
#include "common/utils/Result.h"
#include "common/utils/StatusCode.h"
#include "common/utils/UtcTime.h"
#include "fbs/core/user/User.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Schema.h"
#include "fbs/meta/Service.h"
#include "meta/event/Event.h"
#include "meta/store/Inode.h"
#include "meta/store/Operation.h"
#include "meta/store/ops/SetAttr.h"
// If `result` holds an error whose status code is of type StatusCodeType::Transaction, return that error from the
// enclosing coroutine (via CO_RETURN_ERROR). Any other error, or a success, falls through so the caller can handle
// it per request.
#define CO_RETURN_ON_TXN_ERROR(result) \
do { \
auto &&_r = result; \
if (_r.hasError() && StatusCode::typeOf(_r.error().code()) == StatusCodeType::Transaction) { \
CO_RETURN_ERROR(_r); \
} \
} while (0)
namespace hf3fs::meta::server {
namespace {
// File-local recorder "meta_server.batch_op_size"; sampled in BatchedOp::finish with the size of the batch.
monitor::CountRecorder batchCnt("meta_server.batch_op_size");
}
// Declared here, defined elsewhere; sampled in this file whenever a write session is stored.
extern monitor::CountRecorder openWrite;
/** BatchedOp */
/**
 * Runs the whole batch for inode `inodeId_` inside transaction `txn`.
 *
 * Steps: ask the distributor whether the inode is served by this server, snapshot-load the inode, sanity check the
 * cached file length, then apply sync/close, setattr and create requests (in that order). The inode is added to the
 * read conflict set and stored only when at least one step reported a modification.
 *
 * @return the (possibly modified) inode, or the first batch-level error.
 */
CoTryTask<Inode> BatchedOp::run(IReadWriteTransaction &txn) {
  auto dist = co_await distributor().checkOnServer(txn, inodeId_);
  CO_RETURN_ON_ERROR(dist);
  auto [ok, versionstamp] = *dist;
  if (!ok) {
    XLOGF(INFO, "inode {} not on current server, need retry", inodeId_);
    co_return makeError(MetaCode::kBusy, "inode not on server, retry");
  }

  auto inode = (co_await Inode::snapshotLoad(txn, inodeId_)).then(checkMetaFound<Inode>);
  CO_RETURN_ON_ERROR(inode);

  // sanity check for file length, if we hold the lock, and versionstamp not changed file length shouldn't changed
  if (inode->isFile()) {
    if (versionstamp != versionstamp_) {
      // versionstamp changed: reset the cached length state to what is stored now
      currLength_ = inode->asFile().getVersionedLength();
      nextLength_ = std::nullopt;
      versionstamp_ = versionstamp;
    }
    if (currLength_ != inode->asFile().getVersionedLength() && nextLength_ != inode->asFile().getVersionedLength()) {
      // we should never see this if all meta server is up to date
      // NOTE: the format string has three placeholders; the first one is the file (inode) id.
      XLOGF(DFATAL,
            "file {} length updated during operation, {} != {}",
            inodeId_,
            *currLength_,
            inode->asFile().getVersionedLength());
      co_return makeError(MetaCode::kBusy, "length updated during operation, retry");
    }
  }

  // handle all sync and close operation
  auto r1 = co_await syncAndClose(txn, *inode);
  CO_RETURN_ON_ERROR(r1);
  auto r2 = co_await setAttr(txn, *inode);
  CO_RETURN_ON_ERROR(r2);
  auto r3 = co_await create(txn, *inode);
  CO_RETURN_ON_ERROR(r3);

  auto dirty = *r1 || *r2 || *r3;
  if (dirty) {
    // NOTE: add inode into read conflict set
    CO_RETURN_ON_ERROR(co_await inode->addIntoReadConflict(txn));
    CO_RETURN_ON_ERROR(co_await inode->store(txn));
  }
  co_return *inode;
}
// Applies every queued sync and close request to `inode`, removes the sessions collected from close requests and,
// when some request asked for a length update (or a truncate happened), refreshes the file length via queryLength().
// A failure of an individual request is stored on its waiter and does not fail the batch; a failure while removing a
// session or querying the length is returned as the batch error.
// Returns true when `inode` was modified.
CoTryTask<bool> BatchedOp::syncAndClose(IReadWriteTransaction &txn, Inode &inode) {
std::vector<FileSession> sessions;
std::optional<VersionedLength> hintLength;
bool updateLength = false;
bool truncate = false;
bool dirty = false;
// initial hint length
hintLength = meta::VersionedLength{0, 0};
// merge all requests
for (auto &waiter : syncs_) {
auto result = co_await sync(inode, waiter.get().req, updateLength, truncate, hintLength);
if (result.hasError()) {
waiter.get().result = makeError(std::move(result.error()));
} else {
dirty |= *result;
}
}
for (auto &waiter : closes_) {
auto result = co_await close(inode, waiter.get().req, updateLength, hintLength, sessions);
if (result.hasError()) {
waiter.get().result = makeError(std::move(result.error()));
} else {
dirty |= *result;
}
}
if (truncate) {
// ignore hint length when truncate happened
hintLength = std::nullopt;
updateLength = true;
}
// remove sessions
for (auto &session : sessions) {
CO_RETURN_ON_ERROR(co_await session.remove(txn));
}
if (!updateLength) {
// we don't need updateLength, just return
co_return dirty;
}
if (!inode.isFile()) {
XLOGF(DFATAL, "{} updateLength but not file, shouldn't happen", inode);
co_return makeError(MetaCode::kFoundBug, "updateLength but not file");
}
auto newLength = co_await queryLength(inode, hintLength, truncate);
CO_RETURN_ON_ERROR(newLength);
// remember the computed length so that queryLength() can return it directly next time
nextLength_ = *newLength;
if (*newLength != inode.asFile().getVersionedLength()) {
// the new versioned length must never be older/smaller than the stored one
XLOGF_IF(FATAL,
(newLength->truncateVer < inode.asFile().truncateVer ||
(newLength->truncateVer == inode.asFile().truncateVer && newLength->length < inode.asFile().length)),
"file {}, newLength {} currLength {}",
inode.id,
*newLength,
inode.asFile().getVersionedLength());
XLOGF(DBG, "{} changed, {} != {}", inode.id, *newLength, inode.asFile().getVersionedLength());
SetAttr::update(inode.mtime, UtcClock::now(), config().time_granularity(), true);
// ctime is only touched when the truncate version changed
if (newLength->truncateVer != inode.asFile().truncateVer) {
SetAttr::update(inode.ctime, UtcClock::now(), config().time_granularity(), true);
}
inode.asFile().setVersionedLength(*newLength);
dirty = true;
} else {
XLOGF(DBG,
"{} length not changed, length {} == {}, {} {}",
inode.id,
*newLength,
inode.asFile().getVersionedLength(),
*newLength != inode.asFile().getVersionedLength(),
*newLength == inode.asFile().getVersionedLength());
}
co_return dirty;
}
/**
 * Validates a single SyncReq and merges it into the batch state.
 *
 * @param inode        inode of the batch; atime/mtime/ctime may be updated in place.
 * @param req          request to apply.
 * @param updateLength set to true when the request asks for a length update.
 * @param truncate     OR-ed with the request's `truncated` flag.
 * @param hintLength   merged with the request's length hint when it asks for a length update.
 * @return true when a timestamp of `inode` was changed, or the per-request error.
 */
CoTryTask<bool> BatchedOp::sync(Inode &inode,
                                const SyncReq &req,
                                bool &updateLength,
                                bool &truncate,
                                std::optional<VersionedLength> &hintLength) {
  // reject malformed requests and requests queued on the wrong batch
  CO_RETURN_ON_ERROR(req.valid());
  if (req.inode != PathAt(inodeId_)) {
    XLOGF(DFATAL, "SyncReq {} shouldn't in batch of {}", req, inodeId_);
    co_return makeError(MetaCode::kFoundBug, "Invalid batchOp");
  }

  // anything touching the length only makes sense for a file
  if (!inode.isFile() && (req.updateLength || req.truncated || req.lengthHint)) {
    co_return makeError(MetaCode::kNotFile, "update length but not file");
  }

  // a hint can't carry a newer truncate version than the inode itself
  if (req.lengthHint && req.lengthHint->truncateVer > inode.asFile().truncateVer) {
    auto msg = fmt::format("inode {} hint truncateVer {} > current truncateVer {}",
                           inodeId_,
                           req.lengthHint->truncateVer,
                           inode.asFile().truncateVer);
    XLOG(DFATAL, msg);
    co_return makeError(MetaCode::kFoundBug, std::move(msg));
  }

  bool changed = SetAttr::update(inode.atime, req.atime, config().time_granularity(), true /* cmp */);
  changed |= SetAttr::update(inode.mtime, req.mtime, config().time_granularity(), true /* cmp */);
  if (req.truncated) {
    changed |=
        SetAttr::update(inode.ctime, req.mtime.value_or(UtcClock::now()), config().time_granularity(), true /* cmp */);
  }

  if (req.updateLength) {
    updateLength = true;
    hintLength = VersionedLength::mergeHint(hintLength, req.lengthHint);
  }
  truncate |= req.truncated;

  co_return changed;
}
/**
 * Validates a single CloseReq and merges it into the batch state.
 *
 * @param inode        inode of the batch; atime/mtime may be updated in place.
 * @param req          request to apply.
 * @param updateLength set to true when the request asks for a length update.
 * @param hintLength   merged with the request's length hint when it asks for a length update.
 * @param sessions     receives the FileSession built from the request's session, if it has one.
 * @return true when a timestamp of `inode` was changed, or the per-request error.
 */
CoTryTask<bool> BatchedOp::close(Inode &inode,
                                 const CloseReq &req,
                                 bool &updateLength,
                                 std::optional<VersionedLength> &hintLength,
                                 std::vector<FileSession> &sessions) {
  // reject malformed requests and requests queued on the wrong batch
  CO_RETURN_ON_ERROR(req.valid());
  if (req.inode != PathAt(inodeId_)) {
    XLOGF(DFATAL, "CloseReq {} shouldn't batch of {}", req, inodeId_);
    co_return makeError(MetaCode::kFoundBug, "Invalid batchOp");
  }

  // sessions and length updates only exist for files
  if (!inode.isFile() && (req.session || req.updateLength)) {
    co_return makeError(MetaCode::kNotFile);
  }

  bool changed = SetAttr::update(inode.atime, req.atime, config().time_granularity(), true /* cmp */);
  changed |= SetAttr::update(inode.mtime, req.mtime, config().time_granularity(), true /* cmp */);

  if (req.updateLength) {
    updateLength = true;
    hintLength = VersionedLength::mergeHint(hintLength, req.lengthHint);
  }

  if (req.session.has_value()) {
    sessions.push_back(FileSession::create(inode.id, *req.session));
  }

  co_return changed;
}
/**
 * Determines the versioned length that should be stored for `inode`.
 *
 * Order of preference:
 *   1. a length cached in `nextLength_` by an earlier call;
 *   2. unless hints are disabled by config: the current length if it already covers the hint, or the hint itself
 *      if it has the same truncate version and a larger length;
 *   3. otherwise the length reported by fileHelper().queryLength(); the truncate version is bumped when `truncate`
 *      is set or the reported length is smaller than the stored one.
 *
 * @param inode      must be a file (FATAL otherwise).
 * @param hintLength merged client hint; must be empty when `truncate` is set (FATAL otherwise).
 * @param truncate   whether a truncate happened in this batch.
 */
CoTryTask<VersionedLength> BatchedOp::queryLength(const Inode &inode,
                                                  std::optional<VersionedLength> hintLength,
                                                  bool truncate) {
  XLOGF_IF(FATAL, !inode.isFile(), "not file");
  XLOGF_IF(FATAL, truncate && hintLength, "truncate but hintLength {}", *hintLength);

  if (nextLength_) {
    XLOGF(DBG, "inode {} update to cached nextLength {}", inode, *nextLength_);
    co_return *nextLength_;
  }

  auto currLength = inode.asFile().getVersionedLength();
  if (hintLength && !config().ignore_length_hint()) {
    if (currLength.truncateVer >= hintLength->truncateVer && currLength.length >= hintLength->length) {
      XLOGF(DBG, "don't need update {}, current {}, hint {}", inode.id, currLength, *hintLength);
      co_return currLength;
    }
    // Same truncate version and the hint is longer: trust the hint. The length has to be compared with the
    // current *length* (not with the truncate version counter).
    if (hintLength->truncateVer == currLength.truncateVer && hintLength->length > currLength.length) {
      XLOGF(DBG, "update {} to hint {}, current {}", inode.id, *hintLength, currLength);
      co_return *hintLength;
    }
    XLOGF_IF(DFATAL,
             hintLength->truncateVer > currLength.truncateVer,
             "file {}, hint {} > {}!!!",
             inode.id,
             hintLength->truncateVer,
             currLength.truncateVer);
  }

  XLOGF(DBG,
        "need query length for {}, current {}, hint {}, ignore hint {}, truncate {}, sync {}, close {}",
        inode.id,
        currLength,
        OptionalFmt(hintLength),
        config().ignore_length_hint(),
        truncate,
        syncs_.size(),
        closes_.size());

  // no usable hint: ask the file helper for the real length
  auto length = co_await fileHelper().queryLength(flat::UserInfo(user_), inode);
  CO_RETURN_ON_ERROR(length);
  XLOGF(DBG, "qeury length for {}, get {}", inode.id, *length);

  // a truncate, or a length smaller than the stored one, starts a new truncate version
  auto truncateVer =
      (truncate || *length < inode.asFile().length) ? inode.asFile().truncateVer + 1 : inode.asFile().truncateVer;
  co_return VersionedLength{*length, truncateVer};
}
/**
 * Applies every queued SetAttrReq to `inode`.
 *
 * A request rejected by SetAttr::check only fails its own waiter. When the ACL of a non-root directory changed, the
 * directory entry pointing to it is loaded, its cached `dirAcl` is refreshed, and the entry is added to the read
 * conflict set and stored.
 *
 * @return true when `inode` was modified, or a batch-level error.
 */
CoTryTask<bool> BatchedOp::setAttr(IReadWriteTransaction &txn, Inode &inode) {
  bool changed = false;
  auto aclBefore = inode.acl;

  for (auto &waiterRef : setattrs_) {
    auto &pending = waiterRef.get();
    const auto &req = pending.req;
    if (req.path != PathAt(inodeId_)) {
      XLOGF(DFATAL, "SetAttrReq {} shouldn't in batch of {}", req, inodeId_);
      co_return makeError(MetaCode::kFoundBug, "Invalid batchOp");
    }
    auto checked = SetAttr::check(inode, req, config());
    if (checked.hasError()) {
      pending.result = makeError(checked.error());
      continue;
    }
    changed |= SetAttr::apply(inode, req, config().time_granularity(), config().dynamic_stripe_growth());
  }

  // nothing more to do unless the acl of a non-root directory changed
  if (!(inode.isDirectory() && inode.acl != aclBefore && inode.id != InodeId::root())) {
    co_return changed;
  }

  XLOGF_IF(FATAL, !changed, "acl changed but dirty not set");
  auto loaded = co_await inode.snapshotLoadDirEntry(txn);
  CO_RETURN_ON_ERROR(loaded);
  auto entry = DirEntry(*loaded);
  XLOGF_IF(DFATAL,
           (!inode.asDirectory().name.empty() && entry.name != inode.asDirectory().name),
           "{} != {}",
           entry.name,
           inode.asDirectory().name);
  if (inode.asDirectory().name.empty()) {
    inode.asDirectory().name = entry.name;
  }

  // keep the acl cached in the directory entry in sync with the inode
  entry.dirAcl = inode.acl;
  CO_RETURN_ON_ERROR(co_await entry.addIntoReadConflict(txn));
  CO_RETURN_ON_ERROR(co_await entry.store(txn));
  co_return changed;
}
// Handles all queued create requests, with `inode` being the parent directory.
// - no create request: returns false;
// - parent is not a directory: every create waiter gets kNotDirectory and false is returned;
// - exactly one request: handled directly;
// - otherwise requests are grouped by file name and every group is handled by its own task, with the tasks awaited
//   in rounds (a round is awaited once at least 8 tasks are pending or the last group was scheduled).
// Returns true when the parent inode was modified (chainAllocCounter changed or a group reported dirty).
CoTryTask<bool> BatchedOp::create(IReadWriteTransaction &txn, Inode &inode) {
if (creates_.empty()) {
co_return false;
}
if (!inode.isDirectory()) {
for (auto &waiter : creates_) {
waiter.get().result = makeError(MetaCode::kNotDirectory);
}
co_return false;
}
// working copy of the directory's chain allocation counter, shared by all create tasks
folly::Synchronized<uint32_t> chainAllocCounter(inode.asDirectory().chainAllocCounter);
if (creates_.size() == 1) {
auto result = co_await create(txn, inode, chainAllocCounter, creates_.begin(), creates_.end());
CO_RETURN_ON_ERROR(result);
co_return SetAttr::update(inode.asDirectory().chainAllocCounter, *chainAllocCounter.rlock()) || *result;
}
// group requests by file name; requests for the same name end up adjacent in the multimap
std::multimap<std::string, WaiterRef<CreateReq, CreateRsp>> map;
for (auto &waiter : creates_) {
const auto &name = waiter.get().req.path.path;
if (UNLIKELY(!name || name->has_parent_path())) {
auto msg = fmt::format("inode {}, create req {}", inodeId_, waiter.get().req);
XLOG(DFATAL, msg);
co_return makeError(MetaCode::kFoundBug, std::move(msg));
}
map.insert({name->string(), waiter});
}
std::vector<folly::SemiFuture<Result<bool>>> tasks;
auto exec = co_await folly::coro::co_current_executor;
auto dirty = false;
auto convert = [](auto &iter) { return iter.second; };
// [begin, end) always covers one group of requests sharing the same name
for (auto begin = map.begin(), end = std::next(begin); begin != map.end(); begin = end) {
while (end != map.end() && end->first == begin->first) end++;
auto ibegin = boost::make_transform_iterator(begin, convert);
auto iend = boost::make_transform_iterator(end, convert);
tasks.push_back(create(txn, inode, chainAllocCounter, ibegin, iend).scheduleOn(exec).start());
// await the pending tasks once 8 are queued or the last group was scheduled
if (tasks.size() >= 8 || end == map.end()) {
auto results = co_await folly::coro::collectAllRange(std::exchange(tasks, {}));
for (auto &res : results) {
CO_RETURN_ON_ERROR(res);
dirty |= *res;
}
}
}
assert(tasks.empty());
// write the (possibly advanced) counter back to the parent inode
dirty |= SetAttr::update(inode.asDirectory().chainAllocCounter, *chainAllocCounter.rlock());
co_return dirty;
}
/**
 * Handles a range [begin, end) of create requests; the entry name is taken from the first request of the range.
 *
 * If the directory entry already exists, every request is treated as an open of the existing inode. Otherwise the
 * requests are tried one by one until the first one succeeds in creating the file; the remaining requests then open
 * the freshly created inode. Non-transaction errors only fail the individual waiter.
 *
 * @return false on success (this function never reports the parent as dirty), or a batch-level error.
 */
CoTryTask<bool> BatchedOp::create(IReadWriteTransaction &txn,
                                  const Inode &parent,
                                  folly::Synchronized<uint32_t> &chainAllocCounter,
                                  auto begin,
                                  auto end) {
  if (begin == end) {
    co_return false;
  }

  const auto &path = begin->get().req.path;
  XLOGF_IF(FATAL,
           (parent.id != inodeId_ || path.parent != inodeId_ || !path.path || path.path->has_parent_path()),
           "{}, {}, {}",
           parent.id,
           inodeId_,
           path);
  const auto &name = path.path->string();

  auto existing = co_await DirEntry::snapshotLoad(txn, inodeId_, name);
  CO_RETURN_ON_ERROR(existing);
  XLOGF(DBG, "entry {}/{} -> {}", inodeId_, name, OptionalFmt(*existing));

  if (existing->has_value()) {
    // entry already there: all requests become opens of the existing inode
    auto target = co_await existing->value().snapshotLoadInode(txn);
    CO_RETURN_ON_ERROR(target);
    co_return (co_await openExists(txn, *target, **existing, begin, end)).then([](auto &) { return false; });
  }

  for (auto cur = begin; cur != end; ++cur) {
    auto &waiter = cur->get();
    assert(!existing->has_value());
    auto created = co_await create(txn, parent, chainAllocCounter, waiter.req);
    CO_RETURN_ON_TXN_ERROR(created);
    if (created.hasError()) {
      // this request failed on its own, let the next one try to create the file
      waiter.result = makeError(std::move(created.error()));
      continue;
    }
    auto &[newInode, newEntry] = *created;
    waiter.result = CreateRsp(newInode, false /* needTrunc */);
    waiter.newFile = true;
    // the file exists now, the remaining requests just open it
    co_return (co_await openExists(txn, newInode, newEntry, std::next(cur), end)).then([](auto &) { return false; });
  }
  co_return false;
}
// Creates one new file for `req` under directory `parent`.
// Checks, in order: request validity, parent still linked (nlink != 0), path valid for create, WRITE permission on
// the parent, directory lock. Then resolves the layout (request layout, else the parent's), allocates an inode id,
// builds the DirEntry/Inode, adds parent inode and entry to the read conflict set, stores entry and inode and, for
// non read-only opens with a session, stores a FileSession.
// Returns the created (inode, entry) pair.
CoTryTask<std::pair<Inode, DirEntry>> BatchedOp::create(IReadWriteTransaction &txn,
const Inode &parent,
folly::Synchronized<uint32_t> &chainAllocCounter,
const CreateReq &req) {
CO_RETURN_ON_ERROR(req.valid());
auto parentId = inodeId_;
auto parentAcl = parent.acl;
const auto &name = req.path.path->string();
if (!parent.nlink) {
co_return makeError(MetaCode::kNotFound, fmt::format("{}, Directory {} is removed", req.path, parentId));
}
CO_RETURN_ON_ERROR(req.path.validForCreate());
CO_RETURN_ON_ERROR(parentAcl.checkPermission(req.user, AccessType::WRITE));
CO_RETURN_ON_ERROR(parent.asDirectory().checkLock(req.client));
auto layout = req.layout;
if (!layout.has_value()) {
// user doesn't specific layout, inherit parent directory's layout.
layout = parent.asDirectory().layout;
}
// a non-empty layout only needs validation, an empty one gets chains allocated
if (!layout->empty()) {
CO_RETURN_ON_ERROR(co_await chainAlloc().checkLayoutValid(*layout));
} else {
if (parent.acl.iflags & FS_CHAIN_ALLOCATION_FL) {
CO_RETURN_ON_ERROR(co_await chainAlloc().allocateChainsForLayout(*layout, chainAllocCounter));
} else {
CO_RETURN_ON_ERROR(co_await chainAlloc().allocateChainsForLayout(*layout));
}
}
// new chunk engine is selected either globally by config or per directory by iflags
auto newChunkEngine = config().enable_new_chunk_engine() || (parent.acl.iflags & FS_NEW_CHUNK_ENGINE);
auto inodeId = co_await allocateInodeId(txn, newChunkEngine);
CO_RETURN_ON_ERROR(inodeId);
XLOGF_IF(FATAL,
inodeId->useNewChunkEngine() != newChunkEngine,
"InodeId {}, use new chunk engine {}",
inodeId,
newChunkEngine);
auto entry = DirEntry::newFile(parentId, name, *inodeId);
// remember the request uuid in the entry (openExists compares it to recognize an already finished create)
entry.uuid = req.uuid;
auto inode = Inode::newFile(*inodeId,
Acl(req.user.uid, req.user.gid, meta::Permission(req.perm & ALLPERMS)),
std::move(*layout),
now());
if (config().dynamic_stripe() && req.dynStripe) {
inode.asFile().dynStripe = std::min(config().dynamic_stripe_initial(), inode.asFile().layout.stripeSize);
}
if (parentAcl.perm & S_ISGID) {
// The set-group-ID bit (S_ISGID) has several special uses.
// For a directory, it indicates that BSD semantics are to be used for that directory:
// files created there inherit their group ID from the directory, not from the effective group ID of the creating
// process, and directories created there will also get the S_ISGID bit set
inode.acl.gid = parentAcl.gid;
}
// NOTE: add parent inode and dirEntry into read conflict set.
// add parent inode into read conflict set to prevent parent is removed concurrently
CO_RETURN_ON_ERROR(co_await Inode(parentId).addIntoReadConflict(txn));
// add directory entry into read conflict set to prevent concurrent create
CO_RETURN_ON_ERROR(co_await entry.addIntoReadConflict(txn));
// create inode and dirEntry
CO_RETURN_ON_ERROR(co_await entry.store(txn));
CO_RETURN_ON_ERROR(co_await inode.store(txn));
// a session is only stored for opens that are not read-only
if (req.session && req.flags.accessType() != AccessType::READ) {
openWrite.addSample(1);
CO_RETURN_ON_ERROR(co_await FileSession::create(inode.id, req.session.value()).store(txn));
}
co_return std::make_pair(inode, entry);
}
/**
 * Opens the already existing `inode` (reached through `entry`) for every create request in [begin, end).
 *
 * A request whose uuid equals the non-zero uuid stored in `entry` is answered as an already finished create.
 * Non-transaction errors only fail the individual waiter. If any open modified the inode, it is added to the read
 * conflict set and stored once at the end.
 */
CoTryTask<void> BatchedOp::openExists(IReadWriteTransaction &txn,
                                      Inode &inode,
                                      const DirEntry &entry,
                                      auto begin,
                                      auto end) {
  bool inodeChanged = false;
  for (auto cur = begin; cur != end; ++cur) {
    auto &waiter = cur->get();
    auto &req = waiter.req;

    const bool alreadyCreated = entry.uuid != Uuid::zero() && entry.uuid == req.uuid;
    if (alreadyCreated) {
      // this may happens when FDB returns commit_unknown_result, or we failed to send response to client
      XLOGF(CRITICAL, "Create already finished, dst {}, req {}, uuid {}", entry, req, req.uuid);
      waiter.result = CreateRsp(inode, false /* trunc */);
      continue;
    }

    auto opened = co_await openExists(txn, inode, req);
    CO_RETURN_ON_TXN_ERROR(opened);
    if (!opened.hasError()) {
      waiter.result = CreateRsp(inode, waiter.req.flags.contains(O_TRUNC) /* needTrunc */);
      inodeChanged |= *opened;
    } else {
      waiter.result = makeError(std::move(opened.error()));
    }
  }

  if (!inodeChanged) {
    co_return Void{};
  }
  CO_RETURN_ON_ERROR(co_await inode.addIntoReadConflict(txn));
  CO_RETURN_ON_ERROR(co_await inode.store(txn));
  co_return Void{};
}
// Opens an existing inode on behalf of create request `req`.
// Rejects symlinks (kBusy), non-files (kIsDirectory), O_EXCL (kExists), writes to FS_IMMUTABLE_FL inodes, missing
// permission, and read-only opens of files with holes when config().check_file_hole() is set.
// May clear the SUID/SGID/sticky bits and reset dynStripe on `inode`; stores a FileSession for non read-only opens
// carrying a session.
// Returns true when `inode` was modified (storing it is left to the caller).
CoTryTask<bool> BatchedOp::openExists(IReadWriteTransaction &txn, Inode &inode, const CreateReq &req) {
CO_RETURN_ON_ERROR(req.valid());
if (inode.isSymlink()) {
// todo: rarely happens, how to handle this gracefully?
auto msg = fmt::format("req {}, found symlink {}", req, inode);
XLOG(WARN, msg);
co_return makeError(MetaCode::kBusy, std::move(msg));
}
if (!inode.isFile()) {
assert(inode.isDirectory());
co_return makeError(MetaCode::kIsDirectory);
}
if (req.flags.contains(O_EXCL)) {
co_return makeError(MetaCode::kExists);
}
// check permission
if (req.flags.accessType() != AccessType::READ && (inode.acl.iflags & FS_IMMUTABLE_FL)) {
co_return makeError(MetaCode::kNoPermission, fmt::format("FS_IMMUTABLE_FL set on inode {}", inode.id));
}
CO_RETURN_ON_ERROR(inode.acl.checkPermission(req.user, req.flags.accessType()));
// check hole
auto rdonly = req.flags.accessType() == AccessType::READ;
if (rdonly && inode.asFile().hasHole() && config().check_file_hole()) {
XLOGF(WARN, "Inode {} contains hole, don't allow O_RDONLY", inode.id);
co_return makeError(MetaCode::kFileHasHole);
}
auto dirty = false;
// clear SUID SGID sticky bits on write by non owner
constexpr uint32_t sbits = S_ISUID | S_ISGID | S_ISVTX;
static_assert(sbits == 07000);
if (!rdonly && req.user.uid != inode.acl.uid && (inode.acl.perm & sbits)) {
dirty |= SetAttr::update(inode.acl.perm, Permission(inode.acl.perm & (~sbits)));
}
// update dynamic stripe
if (req.session.has_value() && req.flags.accessType() != AccessType::READ) {
CO_RETURN_ON_ERROR(inode.acl.checkPermission(req.user, req.flags.accessType()));
// the request didn't ask for dynamic stripe but the file still has a partial one: reset it to 0
if (!req.dynStripe && inode.asFile().dynStripe && inode.asFile().dynStripe < inode.asFile().layout.stripeSize) {
dirty |= SetAttr::update(inode.asFile().dynStripe, 0u);
}
}
// create session
if (req.session && req.flags.accessType() != AccessType::READ) {
openWrite.addSample(1);
CO_RETURN_ON_ERROR(co_await FileSession::create(inode.id, req.session.value()).store(txn));
}
co_return dirty;
}
/**
 * Prepares the batch for another attempt: forwards the error to the base Operation, then clears every per-request
 * result recorded by the previous attempt.
 */
void BatchedOp::retry(const Status &error) {
  Operation<Inode>::retry(error);

  auto clearResults = [](auto &waiters) {
    for (auto &ref : waiters) {
      ref.get().result = std::nullopt;
    }
  };
  clearResults(syncs_);
  clearResults(closes_);
  clearResults(setattrs_);
  clearResults(creates_);
}
/** BatchedOp::Waiter */
// Completes a create waiter with the batch result `r`.
// The baton is posted on every exit path (SCOPE_EXIT). A batch error is only taken over when the waiter has no
// result of its own; a missing result on batch success is a FATAL error. For successful requests a Create event is
// recorded for new files and an OpenWrite event for non read-only opens carrying a session.
template <>
void BatchedOp::Waiter<CreateReq, CreateRsp>::finish(BatchedOp &op, const Result<Inode> &r) {
SCOPE_EXIT { baton.post(); };
if (r.hasError() && !result.has_value()) {
result = makeError(r.error());
return;
}
XLOGF_IF(FATAL, !result.has_value(), "req {}, no result", req);
if (result->hasError()) {
return;
}
auto inode = result->value().stat;
if (newFile) {
op.addEvent(Event::Type::Create)
.addField("parent", req.path.parent)
.addField("name", req.path.path->string())
.addField("inode", inode.id)
.addField("user", req.user.uid)
.addField("host", req.client.hostname)
.addField("chain_table", inode.asFile().layout.tableId);
}
if (req.session && req.flags.accessType() != AccessType::READ) {
// O_TRUNC on a file that already existed: report needTruncate in the response
if (req.flags.contains(O_TRUNC) && !newFile) {
result.value()->needTruncate = true;
}
op.addEvent(Event::Type::OpenWrite)
.addField("inode", inode.id)
.addField("owner", inode.acl.uid)
.addField("user", req.user.uid)
.addField("host", req.client.hostname)
.addField("length", inode.asFile().length)
.addField("truncateVer", inode.asFile().truncateVer)
.addField("dynStripe", inode.asFile().dynStripe)
.addField("otrunc", req.flags.contains(O_TRUNC));
}
}
// Completes a sync waiter with the batch result `r`.
// If the waiter has no result of its own, the batch result is converted into a SyncRsp (or its error). For a
// successful request with `truncated` set, a Truncate event and trace are recorded. Finally the baton is posted.
template <>
void BatchedOp::Waiter<SyncReq, SyncRsp>::finish(BatchedOp &op, const Result<Inode> &r) {
if (!result.has_value()) {
result = r.then([](auto &inode) { return SyncRsp(inode); });
}
if (req.truncated && !hasError()) {
auto &inode = result.value()->stat;
XLOGF_IF(DFATAL, !inode.isFile(), "req {} success, but inode {} is not file", req, inode);
if (inode.isFile()) {
op.addEvent(Event::Type::Truncate)
.addField("inode", inode.id)
.addField("length", inode.asFile().length)
.addField("truncateVer", inode.asFile().truncateVer)
.addField("dynStripe", inode.asFile().dynStripe)
.addField("user", req.user.uid)
.addField("host", req.client.hostname);
op.addTrace(MetaEventTrace{
.eventType = Event::Type::Truncate,
.inodeId = inode.id,
.userId = req.user.uid,
.client = req.client,
.length = inode.asFile().length,
.truncateVer = inode.asFile().truncateVer,
.dynStripe = inode.asFile().dynStripe,
});
}
}
baton.post();
}
// Completes a close waiter with the batch result `r`.
// If the waiter has no result of its own, the batch result is converted into a CloseRsp (or its error). For a
// successful request carrying a session, a CloseWrite event and trace are recorded. Finally the baton is posted.
template <>
void BatchedOp::Waiter<CloseReq, CloseRsp>::finish(BatchedOp &op, const Result<Inode> &r) {
if (!result.has_value()) {
result = r.then([](auto &inode) { return CloseRsp(inode); });
}
if (req.session && !hasError()) {
auto &inode = result.value()->stat;
XLOGF_IF(DFATAL, !inode.isFile(), "req {} success, but inode {} is not file", req, inode);
if (inode.isFile()) {
// NOTE(review): the inode id is logged as a hex string here, while the other events pass inode.id directly
op.addEvent(Event::Type::CloseWrite)
.addField("inode", inode.id.toHexString())
.addField("owner", inode.acl.uid)
.addField("user", req.user.uid)
.addField("host", req.client.hostname)
.addField("length", inode.asFile().length)
.addField("truncateVer", inode.asFile().truncateVer)
.addField("dynStripe", inode.asFile().dynStripe)
.addField("prune", req.pruneSession);
op.addTrace(MetaEventTrace{
.eventType = Event::Type::CloseWrite,
.inodeId = inode.id,
.ownerId = inode.acl.uid,
.userId = req.user.uid,
.client = req.client,
.length = inode.asFile().length,
.truncateVer = inode.asFile().truncateVer,
.dynStripe = inode.asFile().dynStripe,
.pruneSession = req.pruneSession,
});
}
}
baton.post();
}
/**
 * Completes a setattr waiter: unless a per-request result was already recorded, the batch result `r` is converted
 * into a SetAttrRsp (or its error). The baton is posted afterwards to wake up the waiting request.
 */
template <>
void BatchedOp::Waiter<SetAttrReq, SetAttrRsp>::finish(BatchedOp &, const Result<Inode> &r) {
  const bool alreadyAnswered = result.has_value();
  if (!alreadyAnswered) {
    result = r.then([](auto &updated) { return SetAttrRsp(updated); });
  }
  baton.post();
}
/**
 * Finishes the batch: records the batch size, completes every waiter with `result` (sync, close, setattr, create,
 * in that order) and finally finishes the base Operation.
 */
void BatchedOp::finish(const Result<Inode> &result) {
  // create requests are part of the batch as well, so they count towards the batch size
  batchCnt.addSample(syncs_.size() + closes_.size() + setattrs_.size() + creates_.size());

  for (auto &waiter : syncs_) {
    waiter.get().finish(*this, result);
  }
  for (auto &waiter : closes_) {
    waiter.get().finish(*this, result);
  }
  for (auto &waiter : setattrs_) {
    waiter.get().finish(*this, result);
  }
  for (auto &waiter : creates_) {
    waiter.get().finish(*this, result);
  }
  Operation<Inode>::finish(result);
}
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,177 @@
#pragma once
#include <atomic>
#include <cassert>
#include <folly/ScopeGuard.h>
#include <folly/Synchronized.h>
#include <folly/Utility.h>
#include <folly/experimental/coro/Baton.h>
#include <folly/logging/xlog.h>
#include <functional>
#include <map>
#include <memory>
#include <optional>
#include <type_traits>
#include <utility>
#include <vector>
#include "common/kv/ITransaction.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Result.h"
#include "common/utils/Shards.h"
#include "common/utils/UtcTime.h"
#include "fbs/core/user/User.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Schema.h"
#include "fbs/meta/Service.h"
#include "meta/store/DirEntry.h"
#include "meta/store/Inode.h"
#include "meta/store/MetaStore.h"
#include "meta/store/Operation.h"
namespace hf3fs::meta::server {
class MetaOperator;
// Operation that batches several requests targeting the same inode (sync, close, setattr and create-in-directory)
// and executes them together in one read-write transaction. Each request is represented by a Waiter, which the
// batch references but does not own.
class BatchedOp : public Operation<Inode> {
public:
// A pending request together with its (eventual) result and the baton posted by finish().
template <typename Req, typename Rsp>
struct Waiter : folly::NonCopyableNonMovable {
Req req;
// empty until a per-request result is recorded or finish() fills it from the batch result
std::optional<Result<Rsp>> result;
folly::coro::Baton baton;
bool newFile = false; /* for Create operation */
Waiter(Req req)
: req(std::move(req)) {}
// Returns a copy of the result; it is a FATAL error to call this before a result was set.
Result<Rsp> getResult() {
XLOGF_IF(FATAL, !result.has_value(), "result not set");
return *result;
}
bool hasError() const { return result.has_value() && result->hasError(); }
// Specialized per request type in the implementation file.
void finish(BatchedOp &op, const Result<Inode> &r);
};
BatchedOp(MetaStore &meta, InodeId inodeId)
: Operation(meta),
inodeId_(inodeId) {}
std::string_view name() const override { return "batchedOp"; }
flat::Uid user() const override { return user_; }
// Adds a waiter to the batch. Only the specializations below exist; each one checks (FATAL) that the request
// targets this batch's inode before queueing it.
template <typename Req, typename Rsp>
void add(Waiter<Req, Rsp> &waiter);
template <>
void add(Waiter<SyncReq, SyncRsp> &waiter) {
XLOGF_IF(FATAL, waiter.req.inode != inodeId_, "{} != {}", waiter.req.inode, inodeId_);
addReq(syncs_, waiter);
}
template <>
void add(Waiter<CloseReq, CloseRsp> &waiter) {
XLOGF_IF(FATAL, waiter.req.inode != inodeId_, "{} != {}", waiter.req.inode, inodeId_);
addReq(closes_, waiter);
}
template <>
void add(Waiter<SetAttrReq, SetAttrRsp> &waiter) {
XLOGF_IF(FATAL, waiter.req.path != PathAt(inodeId_), "{} != {}", waiter.req.path, PathAt(inodeId_));
addReq(setattrs_, waiter);
}
// For create, inodeId_ is the parent directory and the path must be a single name without parent path.
template <>
void add(Waiter<CreateReq, CreateRsp> &waiter) {
XLOGF_IF(FATAL,
(waiter.req.path.parent != inodeId_ || !waiter.req.path.path || waiter.req.path.path->has_parent_path()),
"path {}, inodeId {}",
waiter.req.path,
inodeId_);
addReq(creates_, waiter);
}
CoTryTask<Inode> run(IReadWriteTransaction &txn) override;
void retry(const Status &error) override;
void finish(const Result<Inode> &result) override;
size_t numReqs() const { return numReqs_; }
// for test
// Runs a single create request in its own BatchedOp within `txn` and returns the request's result.
static CoTryTask<CreateRsp> create(MetaStore &store, IReadWriteTransaction &txn, CreateReq req) {
Waiter<CreateReq, CreateRsp> waiter(req);
BatchedOp op(store, req.path.parent);
op.add(waiter);
op.finish(co_await op.run(txn));
co_return waiter.getResult();
}
private:
friend class MetaOperator;
template <typename Req, typename Rsp>
using WaiterRef = std::reference_wrapper<Waiter<Req, Rsp>>;
// Queues `waiter` in `reqs`; the uid of the first queued request becomes the batch user.
void addReq(auto &reqs, auto &waiter) {
if (!user_) {
user_ = waiter.req.user.uid;
}
reqs.emplace_back(waiter);
numReqs_++;
}
CoTryTask<bool> setAttr(IReadWriteTransaction &txn, Inode &inode);
CoTryTask<bool> syncAndClose(IReadWriteTransaction &txn, Inode &inode);
CoTryTask<bool> sync(Inode &inode,
const SyncReq &req,
bool &updateLength,
bool &truncate,
std::optional<VersionedLength> &hintLength);
CoTryTask<bool> close(Inode &inode,
const CloseReq &req,
bool &updateLength,
std::optional<VersionedLength> &hintLength,
std::vector<FileSession> &sessions);
CoTryTask<bool> create(IReadWriteTransaction &txn, Inode &inode);
CoTryTask<bool> create(IReadWriteTransaction &txn,
const Inode &parent,
folly::Synchronized<uint32_t> &chainAllocCounter,
auto begin,
auto end);
CoTryTask<std::pair<Inode, DirEntry>> create(IReadWriteTransaction &txn,
const Inode &parent,
folly::Synchronized<uint32_t> &chainAllocCounter,
const CreateReq &req);
CoTryTask<void> openExists(IReadWriteTransaction &txn, Inode &inode, const DirEntry &entry, auto begin, auto end);
CoTryTask<bool> openExists(IReadWriteTransaction &txn, Inode &inode, const CreateReq &req);
CoTryTask<VersionedLength> queryLength(const Inode &inode, std::optional<VersionedLength> hintLength, bool truncate);
// requests
InodeId inodeId_;
flat::Uid user_; // use first uid
std::vector<WaiterRef<SetAttrReq, SetAttrRsp>> setattrs_;
std::vector<WaiterRef<SyncReq, SyncRsp>> syncs_;
std::vector<WaiterRef<CloseReq, CloseRsp>> closes_;
std::vector<WaiterRef<CreateReq, CreateRsp>> creates_;
size_t numReqs_ = 0;
// state
// versionstamp reported by the distributor on the last run; the cached lengths below are reset when it changes
std::optional<kv::Versionstamp> versionstamp_;
// file length observed when versionstamp_ was recorded
std::optional<VersionedLength> currLength_;
// file length computed by the last length update, reused by queryLength()
std::optional<VersionedLength> nextLength_;
};
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,104 @@
#include <folly/logging/xlog.h>
#include <memory>
#include <stack>
#include <vector>
#include "common/kv/ITransaction.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Result.h"
#include "meta/store/DirEntry.h"
#include "meta/store/MetaStore.h"
#include "meta/store/Operation.h"
#include "meta/store/Utils.h"
namespace hf3fs::meta::server {
// Lexically normalizes `path` without touching the store:
//  - "." components are dropped;
//  - ".." removes the previously kept component, unless there is none (or it is itself "..");
//    in that case ".." is kept for relative paths and dropped for absolute ones;
//  - a "/" component marks the path absolute and discards everything kept so far.
// Returns "." when the result would otherwise be empty.
static Path simplifyPath(const Path &path) {
  std::vector<Path> kept;
  bool rooted = false;
  for (auto &part : path) {
    if (part == ".") {
      continue;
    }
    if (part == "/") {
      rooted = true;
      kept.clear();
      continue;
    }
    if (part != "..") {
      kept.push_back(part);
      continue;
    }
    // part is ".." from here on
    const bool canPop = !kept.empty() && kept.back() != "..";
    if (canPop) {
      kept.pop_back();
    } else if (!rooted) {
      kept.push_back(part);
    }
  }

  Path simplified = rooted ? "/" : "";
  for (auto &part : kept) {
    simplified = simplified / part;
  }
  XLOGF(DBG, "before {}, after {}", path, simplified);
  return simplified.empty() ? "." : simplified;
}
/**
 * MetaStore::getRealPath
 *
 * Read-only operation. When the request carries a path, the path is resolved (following symlinks); if
 * req_.absolute is false the simplified resolve trace is returned directly. Otherwise, and when the request
 * names a directory by inode id only, the absolute path is rebuilt by walking parent directory entries up to
 * the tree root. The request is stored by reference.
 */
class GetRealPathOp : public ReadOnlyOperation<GetRealPathRsp> {
 public:
  GetRealPathOp(MetaStore &meta, const GetRealPathReq &req)
      : ReadOnlyOperation<GetRealPathRsp>(meta),
        req_(req) {}

  OPERATION_TAGS(req_);

  CoTryTask<GetRealPathRsp> run(IReadOnlyTransaction &txn) override {
    XLOGF(DBG, "GetRealPathOp: {}", req_);
    CHECK_REQUEST(req_);

    DirEntry entry;
    if (!req_.path.path.has_value()) {
      // Addressed by inode id only: it must be a directory so that its own dir entry can be looked up.
      auto dir = (co_await Inode::snapshotLoad(txn, req_.path.parent)).then(checkMetaFound<Inode>);
      CO_RETURN_ON_ERROR(dir);
      if (!dir->isDirectory()) {
        co_return makeError(MetaCode::kNotDirectory, "Only support get absolute path of directory");
      }
      if (UNLIKELY(dir->id.isTreeRoot())) {
        co_return GetRealPathRsp("/");
      }
      auto ownEntry = co_await dir->snapshotLoadDirEntry(txn);
      CO_RETURN_ON_ERROR(ownEntry);
      entry = std::move(*ownEntry);
    } else {
      // Addressed by path: resolve it and record the walked path in `trace`.
      Path trace;
      auto resolved = co_await resolve(txn, req_.user, &trace).dirEntry(req_.path, AtFlags(AT_SYMLINK_FOLLOW));
      CO_RETURN_ON_ERROR(resolved);
      if (!req_.absolute) {
        co_return GetRealPathRsp(simplifyPath(trace));
      }
      entry = std::move(*resolved);
    }

    // Prepend the name of each ancestor until the entry's parent is the tree root.
    Path fullPath = entry.name;
    while (!entry.parent.isTreeRoot()) {
      auto ancestor = (co_await Inode::snapshotLoad(txn, entry.parent)).then(checkMetaFound<Inode>);
      CO_RETURN_ON_ERROR(ancestor);
      auto ancestorEntry = co_await ancestor->snapshotLoadDirEntry(txn);
      CO_RETURN_ON_ERROR(ancestorEntry);
      entry = std::move(*ancestorEntry);
      XLOGF(DBG, "get {}", entry.name);
      fullPath = entry.name / fullPath;
    }
    co_return simplifyPath("/" / fullPath);
  }

 private:
  const GetRealPathReq &req_;
};
// Builds the GetRealPathOp for `req`. The op keeps a reference to `req`.
MetaStore::OpPtr<GetRealPathRsp> MetaStore::getRealPath(const GetRealPathReq &req) {
  OpPtr<GetRealPathRsp> op = std::make_unique<GetRealPathOp>(*this, req);
  return op;
}
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,131 @@
#include <algorithm>
#include <cassert>
#include <fcntl.h>
#include <fmt/core.h>
#include <folly/logging/xlog.h>
#include <limits>
#include <linux/fs.h>
#include <memory>
#include <utility>
#include "common/kv/ITransaction.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Result.h"
#include "common/utils/UtcTime.h"
#include "fbs/meta/Service.h"
#include "meta/store/MetaStore.h"
#include "meta/store/Operation.h"
#include "meta/store/ops/SetAttr.h"
namespace hf3fs::meta::server {
/**
 * MetaStore::hardLink
 *
 * Creates a new directory entry at req_.newPath that refers to the inode resolved from req_.oldPath, and
 * increments that inode's nlink. Files and symlinks can be linked; directories are rejected with kIsDirectory.
 * If the destination entry already exists and carries the request's (non-zero) uuid, the operation is treated
 * as already done and the linked inode is returned. The request is stored by reference.
 */
class HardLinkOp : public Operation<HardLinkRsp> {
 public:
  HardLinkOp(MetaStore &meta, const HardLinkReq &req)
      : Operation<HardLinkRsp>(meta),
        req_(req) {}

  OPERATION_TAGS(req_);

  CoTryTask<HardLinkRsp> run(IReadWriteTransaction &txn) override {
    XLOGF(DBG, "HardLinkOp: {}", req_);
    CHECK_REQUEST(req_);

    // check name valid
    CO_RETURN_ON_ERROR(req_.newPath.validForCreate());

    auto resolveResult = co_await resolve(txn, req_.user).path(req_.newPath, req_.flags);
    CO_RETURN_ON_ERROR(resolveResult);
    if (resolveResult->dirEntry.has_value()) {
      auto &entry = *resolveResult->dirEntry;
      if (entry.uuid != Uuid::zero() && entry.uuid == req_.uuid) {
        // this may happen when FDB returns commit_unknown_result, or we failed to send response to client
        XLOGF(CRITICAL, "HardLink already created, dst {}, req {}, uuid {}", entry, req_, req_.uuid);
        auto inode = co_await entry.snapshotLoadInode(txn);
        CO_RETURN_ON_ERROR(inode);
        co_return HardLinkRsp(std::move(*inode));
      }
      // The second argument is the existing directory entry, so label it as such.
      co_return makeError(MetaCode::kExists, fmt::format("hardlink exists, req {}, entry {}", req_, entry));
    }

    auto target = co_await resolve(txn, req_.user).inode(req_.oldPath, req_.flags, true /* checkRefCnt */);
    CO_RETURN_ON_ERROR(target);
    auto &inode = *target;

    // check permission and lock
    auto parent = co_await resolveResult->getParentInode(txn);
    CO_RETURN_ON_ERROR(parent);
    CO_RETURN_ON_ERROR(parent->acl.checkPermission(req_.user, AccessType::WRITE));
    CO_RETURN_ON_ERROR(parent->asDirectory().checkLock(req_.client));

    if (inode.acl.iflags & FS_IMMUTABLE_FL) {
      co_return makeError(MetaCode::kNoPermission, fmt::format("FS_IMMUTABLE_FL set on target inode {}", inode.id));
    }

    assert(inode.nlink);
    InodeId parentId = resolveResult->getParentId();
    // Build the new entry with the same kind (file / symlink) as the target inode.
    DirEntry entry;
    switch (inode.getType()) {
      case InodeType::File:
        entry = DirEntry::newFile(parentId, req_.newPath.path->filename().native(), inode.id);
        break;
      case InodeType::Directory:
        co_return makeError(MetaCode::kIsDirectory);
      case InodeType::Symlink:
        entry = DirEntry::newSymlink(parentId, req_.newPath.path->filename().native(), inode.id);
        break;
      default:
        XLOGF(FATAL, "Found invalid inode type {}", static_cast<int>(inode.getType()));
    }
    // Record the request uuid on the entry; the duplicate check above compares against it.
    entry.uuid = req_.uuid;

    // nlink is compared against the uint16_t maximum; refuse to go past it.
    if (inode.nlink == std::numeric_limits<uint16_t>::max()) {
      XLOGF(ERR, "Inode {} has {} links, can't add more hard link!", inode.id, inode.nlink);
      co_return makeError(MetaCode::kNoPermission, "nlink == uint16_t::max");
    }

    // NOTE: create dirEntry, add parent inode and dirEntry into read conflict set.
    // add parent inode into read conflict set to prevent the parent from being removed concurrently
    CO_RETURN_ON_ERROR(co_await Inode(parentId).addIntoReadConflict(txn));
    // add directory entry into read conflict set to prevent concurrent create
    CO_RETURN_ON_ERROR(co_await entry.addIntoReadConflict(txn));
    CO_RETURN_ON_ERROR(co_await entry.store(txn));

    // NOTE: add link count of inode, add inode into read conflict set since this is a read modify write
    assert(!inode.isDirectory());
    inode.nlink++;
    SetAttr::update(inode.ctime, UtcClock::now(), config().time_granularity(), true);
    CO_RETURN_ON_ERROR(co_await inode.addIntoReadConflict(txn));
    CO_RETURN_ON_ERROR(co_await inode.store(txn));

    addEvent(Event::Type::HardLink)
        .addField("parent", entry.parent)
        .addField("name", entry.name)
        .addField("inode", inode.id)
        .addField("owner", inode.acl.uid)
        .addField("user", req_.user.uid)
        .addField("host", req_.client.hostname)
        .addField("nlink", inode.nlink);
    addTrace(MetaEventTrace{
        .eventType = Event::Type::HardLink,
        .inodeId = inode.id,
        .parentId = entry.parent,
        .entryName = entry.name,
        .ownerId = inode.acl.uid,
        .userId = req_.user.uid,
        .client = req_.client,
        .nlink = inode.nlink,
    });

    co_return HardLinkRsp(std::move(inode));
  }

 private:
  const HardLinkReq &req_;
};
// Builds the HardLinkOp for `req`. The op keeps a reference to `req`.
MetaStore::OpPtr<HardLinkRsp> MetaStore::hardLink(const HardLinkReq &req) {
  OpPtr<HardLinkRsp> op = std::make_unique<HardLinkOp>(*this, req);
  return op;
}
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,68 @@
#include <fcntl.h>
#include <folly/logging/xlog.h>
#include <memory>
#include <utility>
#include "common/kv/ITransaction.h"
#include "common/monitor/Recorder.h"
#include "common/monitor/Sample.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Result.h"
#include "fbs/meta/Common.h"
#include "meta/store/Inode.h"
#include "meta/store/MetaStore.h"
#include "meta/store/Operation.h"
#include "meta/store/Utils.h"
namespace hf3fs::meta::server {
namespace {
// File-local distribution recorder named "meta_server:list_entries".
// ListOp::run feeds it the number of entries in each list result, tagged with the requesting uid.
monitor::DistributionRecorder listDist("meta_server:list_entries");
}
/**
 * MetaStore::list
 *
 * Read-only operation that resolves req_.path (following symlinks) to a directory inode, checks READ
 * permission for req_.user and snapshot-loads a page of its directory entries starting after req_.prev.
 * A non-positive req_.limit falls back to config().list_default_limit(). The request is stored by reference.
 */
class ListOp : public ReadOnlyOperation<ListRsp> {
 public:
  ListOp(MetaStore &meta, const ListReq &req)
      : ReadOnlyOperation<ListRsp>(meta),
        req_(req) {}

  OPERATION_TAGS(req_);

  CoTryTask<ListRsp> run(IReadOnlyTransaction &txn) override {
    XLOGF(DBG, "ListOp: {}", req_);
    CHECK_REQUEST(req_);

    auto resolved =
        co_await resolve(txn, req_.user).inode(req_.path, AtFlags(AT_SYMLINK_FOLLOW), true /* checkRefCnt */);
    CO_RETURN_ON_ERROR(resolved);

    auto &dir = resolved.value();
    if (!dir.isDirectory()) {
      co_return makeError(MetaCode::kNotDirectory);
    }
    const bool readable = static_cast<bool>(dir.acl.checkPermission(req_.user, AccessType::READ));
    if (!readable) {
      co_return makeError(MetaCode::kNoPermission);
    }

    auto page = co_await DirEntryList::snapshotLoad(txn,
                                                    dir.id,
                                                    req_.prev,
                                                    req_.limit > 0 ? req_.limit : config().list_default_limit(),
                                                    req_.status,
                                                    config().batch_stat_concurrent());
    CO_RETURN_ON_ERROR(page);

    // Record how many entries this call returned, tagged by uid.
    listDist.addSample(page->entries.size(), {{"uid", folly::to<std::string>(req_.user.uid)}});
    co_return ListRsp(std::move(*page));
  }

 private:
  const ListReq &req_;
};
// Builds the ListOp for `req`. The op keeps a reference to `req`.
MetaStore::OpPtr<ListRsp> MetaStore::list(const ListReq &req) {
  OpPtr<ListRsp> op = std::make_unique<ListOp>(*this, req);
  return op;
}
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,69 @@
#include <folly/logging/xlog.h>
#include <memory>
#include "common/kv/ITransaction.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Result.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Schema.h"
#include "fbs/meta/Service.h"
#include "meta/store/MetaStore.h"
#include "meta/store/Operation.h"
#include "meta/store/Utils.h"
namespace hf3fs::meta::server {
/**
 * MetaStore::lockDirectory
 *
 * Loads the inode req_.inode, requires WRITE permission for req_.user and that the inode is a directory,
 * then applies req_.action to the directory lock:
 *  - TryLock:     fails with kNoLock if a lock is held by a client with a different uuid, otherwise behaves
 *                 like PreemptLock;
 *  - PreemptLock: sets the lock to Directory::Lock{req_.client} (stored only if it differs from the current one);
 *  - UnLock:      fails with kNoLock if there is no lock or it is held by a different client uuid, otherwise
 *                 behaves like Clear;
 *  - Clear:       removes the lock if one is present.
 * The request is stored by reference.
 */
class LockDirectoryOp : public Operation<LockDirectoryRsp> {
 public:
  LockDirectoryOp(MetaStore &meta, const LockDirectoryReq &req)
      : Operation<LockDirectoryRsp>(meta),
        req_(req) {}

  CoTryTask<LockDirectoryRsp> run(IReadWriteTransaction &txn) override {
    CO_RETURN_ON_ERROR(req_.valid());

    auto inode = (co_await Inode::load(txn, req_.inode)).then(checkMetaFound<Inode>);
    CO_RETURN_ON_ERROR(inode);
    CO_RETURN_ON_ERROR(inode->acl.checkPermission(req_.user, AccessType::WRITE));
    if (!inode->isDirectory()) {
      co_return makeError(MetaCode::kNotDirectory, fmt::format("{} is not directory", inode->id));
    }

    switch (req_.action) {
      case LockDirectoryReq::LockAction::TryLock:
        if (inode->asDirectory().lock && inode->asDirectory().lock->client.uuid != req_.client.uuid) {
          co_return makeError(MetaCode::kNoLock, fmt::format("lock hold by {}", *inode->asDirectory().lock));
        }
        // Not held by another client: take (or refresh) the lock exactly like PreemptLock.
        [[fallthrough]];
      case LockDirectoryReq::LockAction::PreemptLock:
        if (auto lock = Directory::Lock{req_.client}; inode->asDirectory().lock != lock) {
          inode->asDirectory().lock = lock;
          CO_RETURN_ON_ERROR(co_await inode->store(txn));
        }
        co_return LockDirectoryRsp();
      case LockDirectoryReq::LockAction::UnLock:
        if (!inode->asDirectory().lock) {
          co_return makeError(MetaCode::kNoLock, "locked not owned");
        }
        if (inode->asDirectory().lock->client.uuid != req_.client.uuid) {
          co_return makeError(MetaCode::kNoLock, fmt::format("lock hold by {}", *inode->asDirectory().lock));
        }
        // Caller owns the lock: release it exactly like Clear.
        [[fallthrough]];
      case LockDirectoryReq::LockAction::Clear:
        if (inode->asDirectory().lock) {
          inode->asDirectory().lock = std::nullopt;
          CO_RETURN_ON_ERROR(co_await inode->store(txn));
        }
        co_return LockDirectoryRsp();
      default:
        XLOGF(DFATAL, "invalid action {}", static_cast<int>(req_.action));
        co_return makeError(MetaCode::kFoundBug);
    }
  }

 private:
  const LockDirectoryReq &req_;
};
// Builds the LockDirectoryOp for `req`. The op keeps a reference to `req`.
MetaStore::OpPtr<LockDirectoryRsp> MetaStore::lockDirectory(const LockDirectoryReq &req) {
  OpPtr<LockDirectoryRsp> op = std::make_unique<LockDirectoryOp>(*this, req);
  return op;
}
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,136 @@
#include <folly/ScopeGuard.h>
#include <folly/logging/xlog.h>
#include <iterator>
#include <memory>
#include <optional>
#include <sys/stat.h>
#include "common/utils/FaultInjection.h"
#include "common/utils/Result.h"
#include "common/utils/Uuid.h"
#include "fbs/meta/Common.h"
#include "meta/store/DirEntry.h"
#include "meta/store/MetaStore.h"
#include "meta/store/Operation.h"
namespace hf3fs::meta::server {
/**
 * MetaStore::mkdirs
 *
 * Resolves req_.path into its existing prefix and the range of missing components, then creates one directory
 * (inode + dir entry) per missing component inside `txn`. More than one missing component is only accepted
 * when req_.recursive is set. Returns the inode of the last directory created. If nothing is missing, the
 * existing entry is returned when it carries the request's (non-zero) uuid, otherwise kExists.
 * The request is stored by reference.
 */
class MkdirsOp : public Operation<MkdirsRsp> {
 public:
  MkdirsOp(MetaStore &meta, const MkdirsReq &req)
      : Operation<MkdirsRsp>(meta),
        req_(req) {}

  OPERATION_TAGS(req_);

  CoTryTask<MkdirsRsp> run(IReadWriteTransaction &txn) override {
    XLOGF(DBG, "MkdirsOp: {}", req_);
    CHECK_REQUEST(req_);

    auto resolved = co_await resolve(txn, req_.user).pathRange(req_.path);
    CO_RETURN_ON_ERROR(resolved);

    auto it = resolved->missing.begin();
    const auto last = resolved->missing.end();

    if (it == last) {
      // Nothing is missing, so the full path already exists.
      assert(resolved->dirEntry.has_value());
      const bool sameRequest = resolved->dirEntry->uuid != Uuid::zero() && resolved->dirEntry->uuid == req_.uuid;
      if (!sameRequest) {
        co_return makeError(MetaCode::kExists);
      }
      // this may happen when FDB returns commit_unknown_result, or we failed to send response to client
      XLOGF(CRITICAL, "Mkdirs already finished, dst {}, req {}, uuid {}", *resolved->dirEntry, req_, req_.uuid);
      auto existing = co_await resolved->dirEntry->snapshotLoadInode(txn);
      CO_RETURN_ON_ERROR(existing);
      co_return MkdirsRsp(std::move(*existing));
    }

    if (!req_.recursive && std::distance(it, last) != 1) {
      // intermediate path components are missing and this is not a recursive mkdirs
      co_return makeError(MetaCode::kNotFound);
    }

    auto parent = co_await resolved->getParentInode(txn);
    CO_RETURN_ON_ERROR(parent);
    CO_RETURN_ON_ERROR(parent->acl.checkPermission(req_.user, AccessType::WRITE));
    CO_RETURN_ON_ERROR(parent->asDirectory().checkLock(req_.client));

    // Use the layout from the request, or inherit the parent directory's layout when none was given.
    auto layout = req_.layout;
    if (!layout.has_value()) {
      layout = parent->asDirectory().layout;
    }

    InodeId parentId = resolved->getParentId();
    // NOTE: add parent inode and dirEntry into read conflict set.
    // parent inode: guards against the parent being removed concurrently
    CO_RETURN_ON_ERROR(co_await Inode(parentId).addIntoReadConflict(txn));
    // first missing entry: guards against a concurrent create of the same name
    CO_RETURN_ON_ERROR(co_await DirEntry(parentId, it->native()).addIntoReadConflict(txn));

    auto acl = Acl(req_.user.uid,
                   req_.user.gid,
                   Permission(req_.perm & ALLPERMS),
                   IFlags(parent->acl.iflags & FS_FL_INHERITABLE));
    const bool parentHasSetGid = (parent->acl.perm & S_ISGID) != 0;
    if (parentHasSetGid) {
      // S_ISGID on a directory selects BSD semantics: entries created inside inherit the directory's group id
      // rather than the creator's effective group id, and new sub-directories get S_ISGID set as well.
      acl.gid = parent->acl.gid;
      acl.perm = Permission(acl.perm | S_ISGID);
    }

    // create all path components
    FAULT_INJECTION_SET_FACTOR(std::distance(it, last));
    for (;;) {
      const bool badName = !it->has_filename() || it->filename_is_dot() || it->filename_is_dot_dot();
      if (badName) {
        co_return makeError(StatusCode::kInvalidArg, "filename is '.' or '..'");
      }

      auto newId = co_await allocateInodeId(txn, false);
      CO_RETURN_ON_ERROR(newId);
      CO_RETURN_ON_ERROR(co_await chainAlloc().checkLayoutValid(layout.value()));

      Inode inode = Inode::newDirectory(*newId, parentId, it->filename().native(), acl, *layout, now());
      DirEntry entry = DirEntry::newDirectory(parentId, it->native(), *newId, acl);
      entry.uuid = req_.uuid;

      // create inode and dirEntry
      CO_RETURN_ON_ERROR(co_await entry.store(txn));
      CO_RETURN_ON_ERROR(co_await inode.store(txn));

      addEvent(Event::Type::Mkdir)
          .addField("parent", entry.parent)
          .addField("name", entry.name)
          .addField("inode", inode.id)
          .addField("user", inode.acl.uid)
          .addField("host", req_.client.hostname)
          .addField("chain_table", inode.asDirectory().layout.tableId);
      addTrace(MetaEventTrace{
          .eventType = Event::Type::Mkdir,
          .inodeId = inode.id,
          .parentId = entry.parent,
          .entryName = entry.name,
          .userId = inode.acl.uid,
          .client = req_.client,
          .tableId = inode.asDirectory().layout.tableId,
      });

      // The directory just created becomes the parent of the next missing component.
      ++it;
      parentId = *newId;
      if (it == last) {
        co_return MkdirsRsp(std::move(inode));
      }
    }
  }

 private:
  const MkdirsReq &req_;
};
// Builds the MkdirsOp for `req`. The op keeps a reference to `req`.
MetaStore::OpPtr<MkdirsRsp> MetaStore::mkdirs(const MkdirsReq &req) {
  OpPtr<MkdirsRsp> op = std::make_unique<MkdirsOp>(*this, req);
  return op;
}
} // namespace hf3fs::meta::server

308
src/meta/store/ops/Open.cc Normal file
View File

@@ -0,0 +1,308 @@
#include <cassert>
#include <fcntl.h>
#include <folly/logging/xlog.h>
#include <memory>
#include <optional>
#include "common/kv/ITransaction.h"
#include "common/utils/Result.h"
#include "fbs/meta/Common.h"
#include "meta/components/GcManager.h"
#include "meta/components/SessionManager.h"
#include "meta/store/DirEntry.h"
#include "meta/store/MetaStore.h"
#include "meta/store/Operation.h"
#include "meta/store/PathResolve.h"
#include "meta/store/Utils.h"
// BEGIN_WRITE(): guard placed before the first write in a coroutine of an op that may run read-only.
// If this->isReadOnly() is true it logs at DFATAL and co_returns MetaCode::kFoundBug; otherwise it declares
// `rwTxn`, a reference to `txn` dynamic_cast to IReadWriteTransaction.
// Requires `txn` and `req_` to be visible at the expansion site. Because it declares a local named `rwTxn`,
// it can be expanded at most once per scope.
#define BEGIN_WRITE() \
if (this->isReadOnly()) { \
auto msg = fmt::format("Op {}{} shouldn't be readonly!", MetaSerde<>::getRpcName(req_), req_); \
XLOG(DFATAL, msg); \
co_return makeError(MetaCode::kFoundBug, std::move(msg)); \
} \
auto &rwTxn = dynamic_cast<IReadWriteTransaction &>(txn);
namespace hf3fs::meta::server {
// Counter named "meta_server.open_write"; OpenOp::createSession adds 1 to it each time it stores a file session.
monitor::CountRecorder openWrite("meta_server.open_write");
/**
 * MetaStore::open / MetaStore::tryOpen
 *
 * Opens an existing inode, addressed either by inode id (req_.path.path empty) or by path.
 * Instantiated as OpenOp<OpenReq, OpenRsp> and OpenOp<CreateReq, CreateRsp>.
 * The request is held by non-const reference: for CreateReq, run() rewrites req_.path when the target
 * does not exist.
 */
template <typename Req, typename Rsp>
class OpenOp : public Operation<Rsp> {
public:
OpenOp(MetaStore &meta, Req &req)
: Operation<Rsp>(meta),
req_(req) {}
OPERATION_TAGS(req_);
// Read-only only when no session is requested, the access type is READ, and neither O_TRUNC nor O_CREAT is set.
bool isReadOnly() final {
return !req_.session.has_value() && req_.flags.accessType() == AccessType::READ && !req_.flags.contains(O_TRUNC) &&
!req_.flags.contains(O_CREAT);
}
// Entry point: resolves the target (following symlinks) and delegates to openExists().
// Returns kNotFound when opening by path and the final entry does not exist.
CoTryTask<Rsp> run(IReadWriteTransaction &txn) final {
XLOGF(DBG, "OpenOp: {}", req_);
CHECK_REQUEST(req_);
if (!req_.path.path.has_value()) {
// open by inodeId
auto inode =
co_await this->resolve(txn, req_.user)
.inode(req_.path, AtFlags(AT_SYMLINK_FOLLOW) /* open/create follow symlink*/, true /* checkRefCnt */);
CO_RETURN_ON_ERROR(inode);
co_return co_await openExists(txn, std::nullopt, std::move(*inode), this->config().check_file_hole());
} else {
// open by path, can handle O_TRUNC by replace inode here.
auto resolveResult = co_await this->resolve(txn, req_.user).path(req_.path, AtFlags(AT_SYMLINK_FOLLOW));
CO_RETURN_ON_ERROR(resolveResult);
auto &entry = resolveResult->dirEntry;
if (!entry.has_value()) {
if constexpr (std::is_same_v<Req, CreateReq>) {
// For CreateReq, replace the request path with (resolved parent id, file name) before reporting kNotFound.
// NOTE(review): presumably so the caller's follow-up create works on the resolved parent — confirm.
req_.path = PathAt(resolveResult->getParentId(), req_.path.path->filename());
}
co_return makeError(MetaCode::kNotFound);
}
assert(!entry->isSymlink());
auto inode = co_await entry->snapshotLoadInode(txn);
CO_RETURN_ON_ERROR(inode);
co_return co_await openExists(txn, std::move(*entry), std::move(*inode), this->config().check_file_hole());
}
}
// Opens an inode that is known to exist. `entry` is the directory entry it was reached through, or nullopt
// when opened by inode id. Rejects O_EXCL with kExists, then dispatches on the inode type.
CoTryTask<Rsp> openExists(IReadOnlyTransaction &txn, std::optional<DirEntry> entry, Inode inode, bool checkHole) {
XLOGF(DBG, "inode {}", inode);
assert(!entry.has_value() || inode.id == entry->id);
if (prevCreatedInodeId_ == inode.id) {
// this inode is created by us, just return here.
// (prevCreatedInodeId_ is set by createInodeAndEntry.)
if (entry.has_value()) {
addCreateEvent(*entry, inode);
}
co_return Rsp(std::move(inode), false);
}
if (req_.flags.contains(O_EXCL)) {
co_return makeError(MetaCode::kExists);
}
switch (inode.getType()) {
case InodeType::Directory:
co_return co_await openExistsDirectory(txn, inode);
case InodeType::File:
co_return co_await openExistsFile(txn, entry, inode, checkHole);
default:
XLOGF(FATAL, "inode {} invalid type {}", inode, (int)inode.getType());
}
}
// Directories may only be opened for READ, without O_TRUNC, and never through a CreateReq (kIsDirectory otherwise).
CoTryTask<Rsp> openExistsDirectory(IReadOnlyTransaction &txn, Inode &inode) {
XLOGF_IF(FATAL, !inode.isDirectory(), "Inode {} is not directory", inode);
if (req_.flags.accessType() != AccessType::READ || req_.flags.contains(O_TRUNC) || std::is_same_v<Req, CreateReq>) {
co_return makeError(MetaCode::kIsDirectory);
}
CO_RETURN_ON_ERROR(inode.acl.checkPermission(req_.user, req_.flags.accessType()));
co_return Rsp(std::move(inode), false);
}
// Opens an existing regular file: checks flags/permissions, optionally replaces the file on O_TRUNC,
// clears set-id/sticky bits on write by a non-owner, creates a session for write opens, and stores the
// inode if it was modified. The second Rsp argument is `otrunc`, except after a successful replace (false).
CoTryTask<Rsp> openExistsFile(IReadOnlyTransaction &txn,
std::optional<DirEntry> &entry,
Inode &inode,
bool checkHole) {
XLOGF_IF(FATAL, !inode.isFile(), "Inode {} is not file", inode);
// set when the inode is changed in memory and has to be written back
bool dirty = false;
// check permission
if (req_.flags.contains(O_DIRECTORY)) {
co_return makeError(MetaCode::kNotDirectory);
}
if (req_.flags.accessType() != AccessType::READ && (inode.acl.iflags & FS_IMMUTABLE_FL)) {
co_return makeError(MetaCode::kNoPermission, fmt::format("FS_IMMUTABLE_FL set on inode {}", inode.id));
}
CO_RETURN_ON_ERROR(inode.acl.checkPermission(req_.user, req_.flags.accessType()));
// check hole
auto rdonly = req_.flags.accessType() == AccessType::READ;
if (rdonly && inode.asFile().hasHole() && checkHole) {
XLOGF(WARN, "Inode {} contains hole, don't allow O_RDONLY", inode.id);
co_return makeError(MetaCode::kFileHasHole);
}
// handle otrunc
bool otrunc = req_.flags.contains(O_TRUNC);
XLOGF(DBG, "inode {}, otrunc {}", inode, otrunc);
// Replacing is only attempted when the file was reached through a directory entry.
if (otrunc && entry.has_value()) {
BEGIN_WRITE();
auto replaced = co_await replaceExistsFile(rwTxn, *entry, inode);
CO_RETURN_ON_ERROR(replaced);
if (*replaced) {
// `inode` now refers to the freshly created replacement inode.
co_return Rsp(std::move(inode), false);
}
}
// clear SUID SGID sticky bits on write by non owner
constexpr uint32_t sbits = S_ISUID | S_ISGID | S_ISVTX;
static_assert(sbits == 07000);
if (!rdonly && req_.user.uid != inode.acl.uid && (inode.acl.perm & sbits)) {
inode.acl.perm = Permission(inode.acl.perm & (~sbits));
dirty = true;
}
if (req_.session.has_value() && req_.flags.accessType() != AccessType::READ) {
BEGIN_WRITE();
CO_RETURN_ON_ERROR(co_await createSession(rwTxn, inode, req_.flags));
// Reset dynStripe to 0 when the request does not ask for dynamic stripe and the current value is
// non-zero but still below the layout's stripe size.
if (!req_.dynStripe && inode.asFile().dynStripe && inode.asFile().dynStripe < inode.asFile().layout.stripeSize) {
inode.asFile().dynStripe = 0;
dirty = true;
}
}
if (dirty) {
BEGIN_WRITE();
// read-modify-write of the inode: add it to the read conflict set before storing
CO_RETURN_ON_ERROR(co_await inode.addIntoReadConflict(rwTxn));
CO_RETURN_ON_ERROR(co_await inode.store(rwTxn));
}
XLOGF(DBG, "inode {}, otrunc {}", inode, otrunc);
co_return Rsp(std::move(inode), otrunc);
}
// O_TRUNC fast path: instead of truncating, removes the old inode through the GC manager and points the
// entry at a new, empty inode with the same acl and layout. Returns false (nothing changed) when replacing
// is disabled, the file has nlink != 1, its length is below the configured threshold, or it has a session.
// On success `entry` and `inode` are overwritten with the new entry / inode and true is returned.
CoTryTask<bool> replaceExistsFile(kv::IReadWriteTransaction &txn, DirEntry &entry, Inode &inode) {
XLOGF(DBG, "Try to replace file {} on O_TRUNC", entry);
assert(inode.isFile());
if (!this->config().otrunc_replace_file() || inode.nlink != 1 ||
inode.asFile().length < this->config().otrunc_replace_file_threshold()) {
XLOGF(DBG,
"Can't replace {}, enable replace {}, nlink {}, size {}",
entry,
this->config().otrunc_replace_file(),
inode.nlink,
inode.asFile().length);
co_return false;
}
auto checkResult = co_await FileSession::checkExists(txn, inode.id);
CO_RETURN_ON_ERROR(checkResult);
if (*checkResult) {
XLOGF(DBG, "Can't replace {}, has session", entry);
co_return false;
}
XLOGF(DBG, "Replace {} with a new inode", entry);
// keep a copy of the old inode; `inode` is overwritten below
auto old = inode;
CO_RETURN_ON_ERROR(co_await this->gcManager().removeEntry(txn, entry, old, GcInfo{req_.user.uid, entry.name}));
// create new entry and inode
auto inodeId = co_await this->allocateInodeId(txn, false);
CO_RETURN_ON_ERROR(inodeId);
entry = DirEntry::newFile(entry.parent, std::string(entry.name), *inodeId);
inode = Inode::newFile(*inodeId, inode.acl, inode.asFile().layout, this->now());
if (this->config().dynamic_stripe() && req_.dynStripe) {
inode.asFile().dynStripe = std::min(this->config().dynamic_stripe_initial(), inode.asFile().layout.stripeSize);
}
CO_RETURN_ON_ERROR(co_await createInodeAndEntry(txn, entry, inode, old));
CO_RETURN_ON_ERROR(co_await createSession(txn, inode, req_.flags));
co_return true;
}
// Stores `entry` and `inode` in `txn` after adding the parent inode and the entry to the read conflict set,
// remembers the new inode id in prevCreatedInodeId_, and emits a Create event (`old` is the replaced inode, if any).
CoTryTask<Void> createInodeAndEntry(IReadWriteTransaction &txn,
DirEntry &entry,
Inode &inode,
std::optional<Inode> old = std::nullopt) {
auto parentId = entry.parent;
auto inodeId = entry.id;
assert(inode.id == inodeId);
// NOTE: add parent inode and dirEntry into read conflict set.
// add parent inode into read conflict set to prevent parent is removed concurrently
CO_RETURN_ON_ERROR(co_await Inode(parentId).addIntoReadConflict(txn));
// add directory entry into read conflict set to prevent concurrent create
CO_RETURN_ON_ERROR(co_await entry.addIntoReadConflict(txn));
// create inode and dirEntry
CO_RETURN_ON_ERROR(co_await entry.store(txn));
CO_RETURN_ON_ERROR(co_await inode.store(txn));
prevCreatedInodeId_ = inodeId;
addCreateEvent(entry, inode, old);
co_return Void{};
}
// Stores a FileSession for `inode` built from req_.session and emits an OpenWrite event/trace.
// Does nothing when the request has no session; returns kNotFile when `inode` is not a file.
CoTryTask<Void> createSession(IReadWriteTransaction &txn, Inode &inode, OpenFlags oflags) {
if (!inode.isFile()) {
assert(false);
co_return makeError(MetaCode::kNotFile);
}
if (!req_.session.has_value()) {
co_return Void{};
}
openWrite.addSample(1);
auto session = FileSession::create(inode.id, req_.session.value());
CO_RETURN_ON_ERROR(co_await session.store(txn));
this->addEvent(Event::Type::OpenWrite)
.addField("inode", inode.id)
.addField("owner", inode.acl.uid)
.addField("user", req_.user.uid)
.addField("host", req_.client.hostname)
.addField("length", inode.asFile().length)
.addField("truncateVer", inode.asFile().truncateVer)
.addField("dynStripe", inode.asFile().dynStripe)
.addField("otrunc", oflags.contains(O_TRUNC));
this->addTrace(MetaEventTrace{
.eventType = Event::Type::OpenWrite,
.inodeId = inode.id,
.ownerId = inode.acl.uid,
.userId = req_.user.uid,
.client = req_.client,
.length = inode.asFile().length,
.truncateVer = inode.asFile().truncateVer,
.dynStripe = inode.asFile().dynStripe,
.oflags = oflags,
});
co_return Void{};
}
// Emits the Create event and trace for `entry`/`inode`. When `old` holds a file inode, its id and length
// are attached to the event as "old_inode" / "old_length".
void addCreateEvent(const DirEntry &entry, const Inode &inode, std::optional<Inode> old = std::nullopt) {
XLOGF_IF(DFATAL, (old.has_value() && !old->isFile()), "old {} is not file", *old);
auto &event = this->addEvent(Event::Type::Create)
.addField("parent", entry.parent)
.addField("name", entry.name)
.addField("inode", entry.id)
.addField("user", req_.user.uid)
.addField("host", req_.client.hostname)
.addField("chain_table", inode.asFile().layout.tableId);
if (old && old->isFile()) {
event.addField("old_inode", old->id).addField("old_length", old->asFile().length);
}
this->addTrace(MetaEventTrace{
.eventType = Event::Type::Create,
.inodeId = entry.id,
.parentId = entry.parent,
.entryName = entry.name,
.userId = req_.user.uid,
.client = req_.client,
.tableId = inode.asFile().layout.tableId,
});
}
private:
// Mutable reference: run() may rewrite req_.path for CreateReq.
Req &req_;
// Id of the inode stored by createInodeAndEntry, if any; openExists() short-circuits when it sees this id again.
std::optional<InodeId> prevCreatedInodeId_;
};
// Builds the OpenOp instantiation for OpenReq. The op keeps a (mutable) reference to `req`.
MetaStore::OpPtr<OpenRsp> MetaStore::open(OpenReq &req) {
  OpPtr<OpenRsp> op = std::make_unique<OpenOp<OpenReq, OpenRsp>>(*this, req);
  return op;
}
// Builds the OpenOp instantiation for CreateReq. The op keeps a (mutable) reference to `req`.
MetaStore::OpPtr<CreateRsp> MetaStore::tryOpen(CreateReq &req) {
  OpPtr<CreateRsp> op = std::make_unique<OpenOp<CreateReq, CreateRsp>>(*this, req);
  return op;
}
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,74 @@
#include <algorithm>
#include <folly/Likely.h>
#include <folly/experimental/coro/Collect.h>
#include <folly/experimental/coro/Invoke.h>
#include <folly/logging/xlog.h>
#include <memory>
#include <optional>
#include <utility>
#include <vector>
#include "client/meta/MetaClient.h"
#include "common/kv/ITransaction.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Result.h"
#include "common/utils/StatusCode.h"
#include "meta/components/FileHelper.h"
#include "meta/components/SessionManager.h"
#include "meta/store/FileSession.h"
#include "meta/store/Inode.h"
#include "meta/store/MetaStore.h"
#include "meta/store/Operation.h"
namespace hf3fs::meta::server {
/**
 * MetaStore::pruneSession
 *
 * For every session id in req_.sessions, stores a prune record (FileSession::createPrune) in `txn`.
 * The stores are issued in batches of up to kConcurrentCheck concurrent tasks; the first failing
 * task in a batch aborts the operation with its error. The request is stored by reference.
 */
class PruneSessionOp : public Operation<PruneSessionRsp> {
 public:
  PruneSessionOp(MetaStore &meta, const PruneSessionReq &req)
      : Operation<PruneSessionRsp>(meta),
        req_(req) {}

  OPERATION_TAGS(req_);

  CoTryTask<PruneSessionRsp> run(IReadWriteTransaction &txn) override {
    XLOGF(DBG, "PruneSessionOp::run, req {}", req_);
    CHECK_REQUEST(req_);

    static constexpr size_t kConcurrentCheck = 32;
    std::vector<CoTryTask<void>> pending;

    // Awaits everything queued in `pending` (leaving it empty) and returns the first error, if any.
    auto drain = [&]() -> CoTryTask<void> {
      auto outcomes = co_await folly::coro::collectAllRange(std::exchange(pending, {}));
      for (auto &outcome : outcomes) {
        CO_RETURN_ON_ERROR(outcome);
      }
      co_return Void{};
    };

    const size_t total = req_.sessions.size();
    for (size_t idx = 0; idx < total; ++idx) {
      auto sessionId = req_.sessions[idx];
      pending.push_back(prune(txn, sessionId));

      const bool batchFull = pending.size() == kConcurrentCheck;
      const bool isLast = idx + 1 >= total;
      if (batchFull || isLast) {
        CO_RETURN_ON_ERROR(co_await drain());
      }
    }
    co_return PruneSessionRsp();
  }

 private:
  // Stores the prune record for one session id on behalf of req_.client.
  CoTryTask<void> prune(IReadWriteTransaction &txn, const Uuid sessionId) {
    auto record = FileSession::createPrune(req_.client, sessionId);
    CO_RETURN_ON_ERROR(co_await record.store(txn));
    co_return Void{};
  }

  const PruneSessionReq &req_;
};
// Builds the PruneSessionOp for `req`. The op keeps a reference to `req`.
MetaStore::OpPtr<PruneSessionRsp> MetaStore::pruneSession(const PruneSessionReq &req) {
  OpPtr<PruneSessionRsp> op = std::make_unique<PruneSessionOp>(*this, req);
  return op;
}
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,200 @@
#include <algorithm>
#include <cassert>
#include <fcntl.h>
#include <fmt/core.h>
#include <folly/Likely.h>
#include <folly/ScopeGuard.h>
#include <folly/experimental/coro/Collect.h>
#include <folly/experimental/coro/CurrentExecutor.h>
#include <folly/experimental/coro/Invoke.h>
#include <folly/fibers/BatchSemaphore.h>
#include <folly/fibers/Semaphore.h>
#include <folly/futures/Future.h>
#include <folly/logging/xlog.h>
#include <iostream>
#include <limits>
#include <linux/fs.h>
#include <memory>
#include <optional>
#include <queue>
#include <utility>
#include <vector>
#include "common/kv/ITransaction.h"
#include "common/kv/WithTransaction.h"
#include "common/utils/Coroutine.h"
#include "common/utils/FaultInjection.h"
#include "common/utils/Result.h"
#include "common/utils/UtcTime.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Service.h"
#include "fdb/FDBRetryStrategy.h"
#include "meta/components/GcManager.h"
#include "meta/components/SessionManager.h"
#include "meta/event/Event.h"
#include "meta/store/DirEntry.h"
#include "meta/store/Inode.h"
#include "meta/store/MetaStore.h"
#include "meta/store/Operation.h"
#include "meta/store/PathResolve.h"
namespace hf3fs::meta::server {
/** Remove. */
class RemoveOp : public Operation<RemoveRsp> {
public:
RemoveOp(MetaStore &meta, const RemoveReq &req)
: Operation<RemoveRsp>(meta),
req_(req) {}
OPERATION_TAGS(req_);
// Reports whether this remove needs idempotency handling. That is the case when req_.checkUuid() holds and
// the remove is recursive or config().idempotent_remove() is enabled. Only then are `clientId` and
// `requestId` filled in (from req_.client.uuid and req_.uuid); otherwise they are left untouched.
bool needIdempotent(Uuid &clientId, Uuid &requestId) const override {
  const bool wanted = req_.checkUuid() && (req_.recursive || config().idempotent_remove());
  if (!wanted) {
    return false;
  }
  clientId = req_.client.uuid;
  requestId = req_.uuid;
  return true;
}
/**
 * Remove the directory entry addressed by the request inside transaction `txn`.
 *
 * Steps, in order:
 *  1. Resolve the target (by path without following a trailing symlink, or by directory inode id).
 *  2. Reject a missing entry (kNotFound), the tree root (kNoPermission) and an inode id mismatch (kNotFound).
 *  3. Require WRITE permission on the parent directory and that the parent is not locked by another client.
 *  4. Optionally enforce AT_REMOVEDIR vs. entry type, then the sticky-bit and FS_IMMUTABLE_FL rules.
 *  5. Empty directories are removed directly; non-empty directories require `recursive` plus the
 *     recursive-remove permission checks; everything else is handed to the GC manager.
 *
 * An event and a trace record are emitted only after the removal was successfully staged in `txn`.
 *
 * @param txn read-write transaction all lookups and mutations are issued against.
 * @return an empty RemoveRsp on success, otherwise the first error encountered.
 */
CoTryTask<RemoveRsp> run(IReadWriteTransaction &txn) override {
  XLOGF(DBG, "RemoveOp: {}", req_);
  CHECK_REQUEST(req_);

  // Placeholder value; always overwritten by one of the two branches below.
  Result<PathResolveOp::ResolveResult> resolveResult = makeError(MetaCode::kFoundBug);
  if (req_.path.path.has_value()) {
    resolveResult = co_await resolve(txn, req_.user)
                        .path(req_.path, AtFlags(AT_SYMLINK_NOFOLLOW) /* remove shouldn't follow symlink */);
  } else {
    resolveResult = co_await resolve(txn, req_.user).byDirectoryInodeId(req_.path.parent);
  }
  CO_RETURN_ON_ERROR(resolveResult);

  if (!resolveResult->dirEntry.has_value()) {
    co_return makeError(MetaCode::kNotFound);
  } else if (resolveResult->dirEntry->id.isTreeRoot()) {
    // don't permit remove root
    co_return MAKE_ERROR_F(MetaCode::kNoPermission, "Can't remove tree root {}", *resolveResult->dirEntry);
  }

  // check src InodeId: the caller may pin the inode it expects to remove
  if (req_.inodeId && resolveResult->dirEntry->id != req_.inodeId) {
    co_return MAKE_ERROR_F(MetaCode::kNotFound, "remove {}, inodeId != {}", *resolveResult->dirEntry, *req_.inodeId);
  }

  // check permission, must have write permission to parent directory, and not locked
  auto parent = co_await resolveResult->getParentInode(txn);
  CO_RETURN_ON_ERROR(parent);
  CO_RETURN_ON_ERROR(parent->acl.checkPermission(req_.user, AccessType::WRITE));
  CO_RETURN_ON_ERROR(parent->asDirectory().checkLock(req_.client));

  auto &entry = resolveResult->dirEntry.value();
  if (req_.checkType) {
    if (req_.atFlags.contains(AT_REMOVEDIR) && entry.isFile()) {
      co_return makeError(MetaCode::kNotDirectory);
    }
    if (!req_.atFlags.contains(AT_REMOVEDIR) && entry.isDirectory()) {
      co_return makeError(MetaCode::kIsDirectory);
    }
  }

  // The sticky bit (S_ISVTX) on a directory means that a file in that directory can be renamed or deleted
  // only by the owner of the file, by the owner of the directory, and by a privileged process.
  auto loadInodeResult = co_await entry.snapshotLoadInode(txn);
  CO_RETURN_ON_ERROR(loadInodeResult);
  auto &inode = *loadInodeResult;
  if ((parent->acl.perm & S_ISVTX) && req_.user.uid != parent->acl.uid && !req_.user.isRoot() &&
      req_.user.uid != inode.acl.uid) {
    auto msg = fmt::format("can't remove {}, S_ISVTX set on parent {} {}", entry, parent->id, parent->acl);
    XLOG(DBG, msg);
    co_return makeError(MetaCode::kNoPermission, msg);
  }
  if (inode.acl.iflags & FS_IMMUTABLE_FL) {
    auto msg = fmt::format("can't remove {}, FS_IMMUTABLE_FL set on inode", entry);
    XLOG(DBG, msg);
    co_return makeError(MetaCode::kNoPermission, msg);
  }

  // Build the event / trace up front; they are only published once the removal has been staged.
  auto type = std::string(magic_enum::enum_name(inode.getType()));
  folly::toLowerAscii(type);
  auto event = Event(Event::Type::Remove);
  event.addField("parent", entry.parent)
      .addField("name", entry.name)
      .addField("inode", entry.id)
      .addField("type", type)
      // Take the owner from the loaded inode: entry.dirAcl is optional-like and may be empty for this entry,
      // in which case dereferencing it is undefined behaviour. This also matches `ownerId` in the trace.
      .addField("owner", inode.acl.uid)
      .addField("nlink", inode.nlink - 1)
      .addField("user", req_.user.uid)
      .addField("host", req_.client.hostname);
  auto trace = MetaEventTrace{.eventType = Event::Type::Remove,
                              .inodeId = entry.id,
                              .parentId = entry.parent,
                              .entryName = entry.name,
                              .ownerId = inode.acl.uid,
                              .userId = req_.user.uid,
                              .client = req_.client,
                              .inodeType = inode.getType(),
                              .nlink = inode.nlink,
                              .recursiveRemove = req_.recursive};
  auto gcInfo = GcInfo{req_.user.uid, entry.name};

  if (entry.isDirectory()) {
    event.addField("recursive", req_.recursive);
    auto result = co_await DirEntryList::checkEmpty(txn, entry.id);
    CO_RETURN_ON_ERROR(result);
    if (auto empty = *result; empty) {
      XLOGF_IF(DFATAL, inode.nlink != 1, "Directory {} nlink != 1", inode);
      // remove directory directly
      CO_RETURN_ON_ERROR(co_await entry.addIntoReadConflict(txn));
      CO_RETURN_ON_ERROR(co_await inode.addIntoReadConflict(txn));
      CO_RETURN_ON_ERROR(co_await entry.remove(txn));
      CO_RETURN_ON_ERROR(co_await inode.remove(txn));
      addEvent(event);
      addTrace(std::move(trace));
      co_return RemoveRsp{};
    }
    if (!req_.recursive) {
      co_return makeError(MetaCode::kNotEmpty);
    }
    CO_RETURN_ON_ERROR(inode.acl.checkRecursiveRmPerm(req_.user, config().recursive_remove_check_owner()));
    auto recursiveCheck = config().recursive_remove_perm_check();
    if (recursiveCheck) {
      auto res = co_await DirEntryList::recursiveCheckRmPerm(txn, inode.id, req_.user, recursiveCheck, 128);
      CO_RETURN_ON_ERROR(res);
    }
    // recursive remove, save original path: prepend each ancestor's name to the entry name
    auto ancestors = std::vector<Inode>();
    CO_RETURN_ON_ERROR(co_await Inode::loadAncestors(txn, ancestors, entry.parent));
    for (auto &ancestor : ancestors) {
      gcInfo.origPath = ancestor.asDirectory().name / gcInfo.origPath;
    }
    event.addField("origPath", gcInfo.origPath.string());
    trace.origPath = gcInfo.origPath;
  }

  // Files, symlinks and non-empty directories (recursive remove) go through the GC manager.
  auto result = co_await gcManager().removeEntry(txn, entry, inode, gcInfo);
  CO_RETURN_ON_ERROR(result);
  addEvent(event);
  addTrace(std::move(trace));
  co_return RemoveRsp{};
}
private:
const RemoveReq &req_;
};
/** Build the operation object that executes `req` as a RemoveOp bound to this store. */
MetaStore::OpPtr<RemoveRsp> MetaStore::remove(const RemoveReq &req) {
  return std::make_unique<RemoveOp>(*this, req);
}
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,369 @@
#include <algorithm>
#include <cassert>
#include <fcntl.h>
#include <folly/Likely.h>
#include <folly/experimental/coro/Collect.h>
#include <folly/logging/xlog.h>
#include <linux/fs.h>
#include <memory>
#include <optional>
#include <set>
#include <string>
#include <sys/stat.h>
#include <utility>
#include <vector>
#include "common/kv/ITransaction.h"
#include "common/utils/Coroutine.h"
#include "common/utils/FaultInjection.h"
#include "common/utils/Path.h"
#include "common/utils/Result.h"
#include "common/utils/UtcTime.h"
#include "fbs/core/user/User.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Service.h"
#include "meta/components/GcManager.h"
#include "meta/store/DirEntry.h"
#include "meta/store/Inode.h"
#include "meta/store/MetaStore.h"
#include "meta/store/Operation.h"
#include "meta/store/PathResolve.h"
#include "meta/store/Utils.h"
#include "meta/store/ops/SetAttr.h"
namespace hf3fs::meta::server {
/** MetaStore::rename */
/**
* Note: rename operation in POSIX and HDFS has different semantic when destination exists.
* In POSIX, if destination is a file or an empty directory, it will be replaced automatically (special case:
* the RENAME_NOREPLACE flag is set in renameat2).
* In HDFS,
* - if destination is a file, rename operation will raise FileAlreadyExistsException;
* - if destination is a directory and source is file, source will be moved under destination (eg: mv file dir ->
* dir/file);
* - if both source and destination are directories, all children of source will be moved under destination recursively
* (we have decided to not provide this semantic because it's too complicated).
*
* This function implements POSIX semantic.
*/
/**
 * Read-write operation behind MetaStore::rename.
 *
 * run() resolves source and destination, validates the request (existence, type compatibility, loop
 * detection for directories, permissions), then removes the source entry, disposes of a replaced
 * destination and stores the new destination entry, all inside the caller's transaction.
 */
class RenameOp : public Operation<RenameRsp> {
public:
RenameOp(MetaStore &meta, const RenameReq &req)
: Operation<RenameRsp>(meta),
req_(req) {}
OPERATION_TAGS(req_);
/**
 * Returns true when the last element of `ancestors` is the root inode and the element before it is a
 * directory named "trash".
 */
static bool underTrash(const std::vector<Inode> &ancestors) {
return ancestors.size() >= 2 && ancestors[ancestors.size() - 1].id == InodeId::root() &&
ancestors[ancestors.size() - 2].asDirectory().name == "trash";
}
/**
 * Reports whether this request should be handled idempotently; on true, `clientId` / `requestId` are
 * filled from the request. Requires a request uuid that passes checkUuid() and either moveToTrash or
 * the idempotent_rename config switch.
 */
bool needIdempotent(Uuid &clientId, Uuid &requestId) const override {
if (!req_.checkUuid()) return false;
if (!req_.moveToTrash && !config().idempotent_rename()) return false;
clientId = req_.client.uuid;
requestId = req_.uuid;
return true;
}
/**
 * Directory-only validation of the destination's ancestor chain (run() calls this only when src is a
 * directory).
 *
 * Fails with kInvalidArg when src is one of dst's ancestors, kNotFound when an ancestor has nlink == 0,
 * kNoPermission when the chain ends at the GC root, and kFoundBug when a non-root inode is its own parent.
 * When dst lives under the trash directory, additionally enforces the trash permission rules and fills
 * `origPath` with the source path built from the source ancestors' names.
 */
CoTryTask<Void> checkLoop(IReadWriteTransaction &txn,
const PathResolveOp::ResolveResult &srcResult,
const PathResolveOp::ResolveResult &dstResult,
std::optional<Path> &origPath) {
auto dstAncestors = std::vector<Inode>();
CO_RETURN_ON_ERROR(co_await Inode::loadAncestors(txn, dstAncestors, dstResult.getParentId()));
assert(!dstAncestors.empty());
for (auto &ancestor : dstAncestors) {
// src is not dst's ancestor
if (ancestor.id == srcResult.dirEntry->id) {
// trying to move a directory into its own descendant
co_return makeError(StatusCode::kInvalidArg, "try to move directory into it's descendent");
}
// move into a deleted directory
if (ancestor.nlink == 0) {
co_return makeError(MetaCode::kNotFound);
}
// check root: an inode that is its own parent terminates the chain
if (ancestor.id == ancestor.asDirectory().parent) {
if (ancestor.id == InodeId::root()) {
break;
} else if (ancestor.id == InodeId::gcRoot()) {
XLOGF(ERR, "RenameOp: {} move directory into a removed directory", req_);
co_return makeError(MetaCode::kNoPermission);
} else {
XLOGF(DFATAL, "Inode {} parent is itself", ancestor);
co_return makeError(MetaCode::kFoundBug);
}
}
}
if (underTrash(dstAncestors)) {
XLOGF_IF(FATAL, !srcResult.dirEntry->isDirectory(), "{} not directory", *srcResult.dirEntry);
auto srcAncestors = std::vector<Inode>();
CO_RETURN_ON_ERROR(co_await Inode::loadAncestors(txn, srcAncestors, srcResult.getParentId()));
if (req_.moveToTrash || config().allow_directly_move_to_trash()) {
auto acl = srcResult.dirEntry->dirAcl;
if (!acl) {
XLOGF(DFATAL, "DirEntry {} is directory, but don't have acl", *srcResult.dirEntry);
co_return makeError(MetaCode::kFoundBug);
}
// moving a directory into the trash directory is subject to the same permission checks as a recursive remove
CO_RETURN_ON_ERROR(acl->checkRecursiveRmPerm(req_.user, config().recursive_remove_check_owner()));
auto recursiveCheck = config().recursive_remove_perm_check();
if (recursiveCheck) {
auto res =
co_await DirEntryList::recursiveCheckRmPerm(txn, srcResult.dirEntry->id, req_.user, recursiveCheck, 128);
CO_RETURN_ON_ERROR(res);
}
} else if (req_.user.uid != flat::Uid(0)) {
// without moveToTrash, a non-root (uid != 0) user may only move things that are already in trash
if (!underTrash(srcAncestors)) {
co_return makeError(MetaCode::kNoPermission, "try to move into trash directory without moveToTrash");
}
}
// record the source's original path: ancestor names are prepended one by one to the entry name
origPath = Path(srcResult.dirEntry->name);
for (auto &ancestor : srcAncestors) {
origPath = ancestor.asDirectory().name / *origPath;
}
}
co_return Void{};
}
/** Loads the inode of `entry` into `inode` via a snapshot read, unless `inode` already holds a value. */
CoTryTask<Void> snapshotLoadInode(IReadWriteTransaction &txn, const DirEntry &entry, std::optional<Inode> &inode) {
if (!inode.has_value()) {
auto result = co_await entry.snapshotLoadInode(txn);
CO_RETURN_ON_ERROR(result);
inode = std::move(*result);
}
co_return Void{};
}
/**
 * Permission check for one side (src or dst) of the rename.
 *
 * Requires WRITE permission on the parent and that the parent is not locked by another client; for the
 * destination side (`dst == true`) the parent must also still be linked (nlink != 0). If the entry
 * exists, its inode is loaded into `inode` and the FS_IMMUTABLE_FL and sticky-bit rules are applied.
 */
CoTryTask<Void> checkPermission(IReadWriteTransaction &txn,
PathResolveOp::ResolveResult &resolve,
std::optional<Inode> &inode,
bool dst) {
auto parent = co_await resolve.getParentInode(txn);
CO_RETURN_ON_ERROR(parent);
CO_RETURN_ON_ERROR(parent->acl.checkPermission(req_.user, AccessType::WRITE));
CO_RETURN_ON_ERROR(parent->asDirectory().checkLock(req_.client));
if (dst && !parent->nlink) {
// can't rename into a removed directory
co_return makeError(MetaCode::kNotFound);
}
// nothing more to check when the entry itself does not exist (only possible for dst; run() rejects a missing src)
if (!resolve.dirEntry.has_value()) {
co_return Void{};
}
auto &entry = *resolve.dirEntry;
CO_RETURN_ON_ERROR(co_await snapshotLoadInode(txn, entry, inode));
if (inode->acl.iflags & FS_IMMUTABLE_FL) {
auto msg = fmt::format("rename can't move {}, FS_IMMUTABLE_FL set on inode", entry);
XLOG(DBG, msg);
co_return makeError(MetaCode::kNoPermission, msg);
}
// The sticky bit (S_ISVTX) on a directory means that a file in that directory can be renamed or deleted
// only by the owner of the file, by the owner of the directory, and by a privileged process.
if ((parent->acl.perm & S_ISVTX) && req_.user.uid != parent->acl.uid && !req_.user.isRoot()) {
// neither the directory owner nor a privileged user: must be the owner of the file
if (req_.user.uid != inode->acl.uid) {
auto msg = fmt::format("rename can't move {} {}, S_ISVTX set on parent {} {}",
entry,
inode->acl,
resolve.getParentId(),
parent->acl);
XLOG(DBG, msg);
co_return makeError(MetaCode::kNoPermission, msg);
}
}
co_return Void{};
}
/**
 * Disposes of the inode behind an existing destination entry that is about to be replaced.
 *
 * Returns std::nullopt when there is no destination entry, otherwise the pair (inode id, nlink value
 * reported for the event log). Files go through the GC manager, directories (run() has already verified
 * they are empty) have their inode removed directly, and symlinks get their nlink decremented and are
 * stored back.
 */
CoTryTask<std::optional<std::pair<InodeId, uint16_t>>> removeDst(IReadWriteTransaction &txn,
PathResolveOp::ResolveResult &dst,
std::optional<Inode> &dstInode) {
if (!dst.dirEntry.has_value()) {
co_return std::nullopt;
}
assert(dst.dirEntry->name == req_.dest.path->filename().native());
if (dst.dirEntry->isFile()) {
// let GC task free file chunks.
CO_RETURN_ON_ERROR(co_await snapshotLoadInode(txn, *dst.dirEntry, dstInode));
CO_RETURN_ON_ERROR(
co_await gcManager().removeEntry(txn, *dst.dirEntry, *dstInode, GcInfo{req_.user.uid, dst.dirEntry->name}));
assert(dstInode->id == dst.dirEntry->id);
co_return std::pair<InodeId, uint16_t>{dstInode->id, dstInode->nlink};
} else if (dst.dirEntry->isDirectory()) {
// empty directory, can remove Inode directly
CO_RETURN_ON_ERROR(co_await Inode(dst.dirEntry->id).remove(txn));
co_return std::pair<InodeId, uint16_t>{dst.dirEntry->id, 0};
} else {
XLOGF_IF(DFATAL, !dst.dirEntry->isSymlink(), "{} not symlink, shouldn't happen", *dst.dirEntry);
// need load inode and check refcnt
auto inode = co_await dst.dirEntry->loadInode(txn);
CO_RETURN_ON_ERROR(inode);
if (UNLIKELY(inode->nlink == 0)) {
auto msg = fmt::format("entry {} exists, but inode {} nlink == 0", *dst.dirEntry, inode);
XLOG(DFATAL, msg);
co_return makeError(MetaCode::kFoundBug, msg);
}
// NOTE: The fuse client may have cached this symlink. If delete it immediately, kNotFound will be reported for
// subsequent visits. The temporary solution is not to delete the symlink inode. This problem needs to be resolved
// later.
SetAttr::update(inode->ctime, UtcClock::now(), config().time_granularity(), true);
auto refcnt = --inode->nlink;
CO_RETURN_ON_ERROR(co_await inode->store(txn));
// if (refcnt != 0) {
// CO_RETURN_ON_ERROR(co_await inode->store(txn));
// } else {
// CO_RETURN_ON_ERROR(co_await inode->remove(txn));
// }
co_return std::pair<InodeId, uint16_t>{dst.dirEntry->id, refcnt};
}
}
/**
 * Executes the rename inside `txn`.
 *
 * On success returns a RenameRsp built from the moved inode (or from the destination inode when the
 * request turns out to be a replay or a no-op). Mutations are staged in this order: update of a
 * directory inode's parent/name, removal of the source entry, disposal of the old destination, store of
 * the new destination entry.
 */
CoTryTask<RenameRsp> run(IReadWriteTransaction &txn) override {
XLOGF(DBG, "RenameOp: {}", req_);
CHECK_REQUEST(req_);
// resolve both paths together; neither follows a trailing symlink
auto [srcResult, dstResult] =
co_await folly::coro::collectAll(resolve(txn, req_.user).path(req_.src, AtFlags(AT_SYMLINK_NOFOLLOW)),
resolve(txn, req_.user).path(req_.dest, AtFlags(AT_SYMLINK_NOFOLLOW)));
CO_RETURN_ON_ERROR(srcResult);
CO_RETURN_ON_ERROR(dstResult);
// check dst: the transaction may already have been executed (dst entry carries this request's uuid).
if (dstResult->dirEntry.has_value() && dstResult->dirEntry->uuid != Uuid::zero() &&
dstResult->dirEntry->uuid == req_.uuid) {
// this may happen when FDB returns commit_unknown_result, or we failed to send the response to the client
XLOGF(CRITICAL, "Rename already finished, dst {}, req {}, uuid {}", *dstResult->dirEntry, req_, req_.uuid);
auto inode = co_await dstResult->dirEntry->snapshotLoadInode(txn);
CO_RETURN_ON_ERROR(inode);
co_return RenameRsp(std::move(*inode));
}
// src must exist
if (!srcResult->dirEntry.has_value()) {
co_return MAKE_ERROR_F(MetaCode::kNotFound, "rename src {} not found", req_.src);
}
// check src InodeId
if (req_.inodeId && srcResult->dirEntry->id != req_.inodeId) {
co_return MAKE_ERROR_F(MetaCode::kNotFound, "rename src {}, inodeId != {}", *srcResult->dirEntry, *req_.inodeId);
}
// if src and dst point to the same dir entry, do nothing
if (dstResult->dirEntry.has_value() && dstResult->dirEntry->parent == srcResult->dirEntry->parent &&
dstResult->dirEntry->name == srcResult->dirEntry->name) {
auto inode = co_await dstResult->dirEntry->snapshotLoadInode(txn);
CO_RETURN_ON_ERROR(inode);
co_return RenameRsp(std::move(*inode));
}
// move to trash must not replace a file that already exists
if (dstResult->dirEntry.has_value() && dstResult->dirEntry->isFile() && req_.moveToTrash) {
co_return MAKE_ERROR_F(MetaCode::kExists, "rename dest {} exist", req_.dest);
}
// dst shouldn't be a non-empty directory
if (dstResult->dirEntry.has_value() && dstResult->dirEntry->isDirectory()) {
auto checkResult = co_await DirEntryList::checkEmpty(txn, dstResult->dirEntry->id);
CO_RETURN_ON_ERROR(checkResult);
bool empty = checkResult.value();
if (!empty) {
co_return MAKE_ERROR_F(MetaCode::kNotEmpty, "rename dest {} not empty", req_.dest);
}
}
// now, dst can be safely replaced (not exist, empty directory, file, symlink).
std::optional<Path> origPath;
if (srcResult->dirEntry->isDirectory()) {
if (dstResult->dirEntry.has_value() && !dstResult->dirEntry->isDirectory()) {
// man 2 rename: oldpath can specify a directory. In this case, newpath must either not exist, or it must
// specify an empty directory.
co_return makeError(MetaCode::kNotDirectory);
}
CO_RETURN_ON_ERROR(co_await checkLoop(txn, *srcResult, *dstResult, origPath));
}
// permission check
std::optional<Inode> srcInode, dstInode;
CO_RETURN_ON_ERROR(co_await checkPermission(txn, *srcResult, srcInode, false));
CO_RETURN_ON_ERROR(co_await checkPermission(txn, *dstResult, dstInode, true));
// NOTE: add src/dst's parent inode and dirEntry into read conflict set.
CO_RETURN_ON_ERROR(co_await Inode(srcResult->getParentId()).addIntoReadConflict(txn));
CO_RETURN_ON_ERROR(co_await srcResult->dirEntry->addIntoReadConflict(txn));
CO_RETURN_ON_ERROR(co_await Inode(dstResult->getParentId()).addIntoReadConflict(txn));
CO_RETURN_ON_ERROR(
co_await DirEntry(dstResult->getParentId(), req_.dest.path->filename().native()).addIntoReadConflict(txn));
auto &srcEntry = srcResult->dirEntry.value();
auto inodeResult = co_await srcEntry.loadInode(txn);
CO_RETURN_ON_ERROR(inodeResult);
auto &inode = inodeResult.value();
if (srcEntry.isDirectory()) {
// NOTE: add src's inode into read conflict set.
// load inode and update it's parent, read modify write, should use load.
inode.asDirectory().parent = dstResult->getParentId();
inode.asDirectory().name = req_.dest.path->filename().native();
auto updateInodeResult = co_await inode.store(txn);
CO_RETURN_ON_ERROR(updateInodeResult);
}
// remove src entry and dst entry
CO_RETURN_ON_ERROR(co_await srcEntry.remove(txn));
auto removeDstResult = co_await removeDst(txn, *dstResult, dstInode);
CO_RETURN_ON_ERROR(removeDstResult);
auto &oldDst = *removeDstResult;
// create dst entry: same payload as the source entry, tagged with the request uuid so a replay can be detected
DirEntry newDstEntry(dstResult->getParentId(), req_.dest.path->filename().native());
newDstEntry.data() = srcEntry.data();
newDstEntry.uuid = req_.uuid;
CO_RETURN_ON_ERROR(co_await newDstEntry.store(txn));
auto &event = addEvent(Event::Type::Rename)
.addField("srcParent", srcEntry.parent)
.addField("srcName", srcEntry.name)
.addField("dstParent", newDstEntry.parent)
.addField("dstName", newDstEntry.name)
.addField("inode", newDstEntry.id)
.addField("user", req_.user.uid)
.addField("host", req_.client.hostname);
addTrace(MetaEventTrace{.eventType = Event::Type::Rename,
.inodeId = newDstEntry.id,
.parentId = srcEntry.parent,
.entryName = srcEntry.name,
.dstParentId = newDstEntry.parent,
.dstEntryName = newDstEntry.name,
.userId = req_.user.uid,
.client = req_.client,
.origPath = origPath.value_or(Path())});
// optional event fields: the replaced destination and the original path (set only by checkLoop for trash moves)
if (oldDst.has_value()) {
auto [oldDstInode, oldDstNlink] = *oldDst;
event.addField("oldDstInode", oldDstInode).addField("oldDstNlink", oldDstNlink);
}
if (origPath.has_value()) {
event.addField("origPath", origPath->string());
}
co_return RenameRsp(std::move(inode));
}
private:
// The request being executed; held by reference, so it must outlive this operation.
const RenameReq &req_;
};
/** Build the operation object that executes `req` as a RenameOp bound to this store. */
MetaStore::OpPtr<RenameRsp> MetaStore::rename(const RenameReq &req) {
  return std::make_unique<RenameOp>(*this, req);
}
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,99 @@
#include "SetAttr.h"
#include <fcntl.h>
#include <folly/ScopeGuard.h>
#include <folly/logging/xlog.h>
#include <optional>
#include <sys/stat.h>
#include "common/kv/ITransaction.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Result.h"
#include "common/utils/UtcTime.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Service.h"
#include "meta/store/DirEntry.h"
#include "meta/store/Inode.h"
#include "meta/store/MetaStore.h"
#include "meta/store/Operation.h"
#include "meta/store/PathResolve.h"
namespace hf3fs::meta::server {
/**
 * Read-write operation behind MetaStore::setAttr.
 *
 * Loads the target inode (through its directory entry when a path is given, otherwise by inode),
 * validates and applies the requested attribute changes with the SetAttr helpers, keeps the ACL copy
 * stored in a directory's DirEntry in sync, and persists whatever changed.
 */
class SetAttrOp : public Operation<SetAttrRsp> {
 public:
  SetAttrOp(MetaStore &meta, const SetAttrReq &req_)
      : Operation<SetAttrRsp>(meta),
        req_(req_) {}

  OPERATION_TAGS(req_);

  /**
   * Applies the request inside `txn` and returns the resulting inode wrapped in a SetAttrRsp.
   * Nothing is written when the request does not change any attribute.
   */
  CoTryTask<SetAttrRsp> run(IReadWriteTransaction &txn) override {
    XLOGF(DBG, "SetAttrOp: {}", req_);

    Inode inode;
    std::optional<DirEntry> entry;
    if (!req_.path.path.has_value()) {
      // No path component: resolve straight to the inode.
      auto loaded =
          co_await resolve(txn, req_.user).inode(req_.path, req_.flags | AT_SYMLINK_FOLLOW, true /* checkRefCnt */);
      CO_RETURN_ON_ERROR(loaded);
      inode = std::move(*loaded);
    } else {
      // Path given: keep the directory entry around, it may need its ACL copy refreshed below.
      auto resolvedEntry = co_await resolve(txn, req_.user)
                               .dirEntry(req_.path, req_.flags | AT_SYMLINK_FOLLOW /* follow symlink by default */);
      CO_RETURN_ON_ERROR(resolvedEntry);
      entry = std::move(*resolvedEntry);
      auto loaded = co_await entry->snapshotLoadInode(txn);
      CO_RETURN_ON_ERROR(loaded);
      inode = std::move(*loaded);
    }

    auto oldAcl = inode.acl;
    CO_RETURN_ON_ERROR(SetAttr::check(inode, req_, config()));
    bool dirty = SetAttr::apply(inode, req_, config().time_granularity(), config().dynamic_stripe_growth());

    // A directory's ACL is mirrored in its DirEntry (the root inode is excluded from this).
    const bool syncDirAcl = inode.isDirectory() && inode.acl != oldAcl && inode.id != InodeId::root();
    if (syncDirAcl) {
      XLOGF_IF(FATAL, !dirty, "acl changed but dirty not set, {} != {}", inode.acl, oldAcl);
      const bool needLookup = !entry.has_value() || entry->name == "." || entry->name == "..";
      if (needLookup) {
        // No usable entry from path resolution: load the directory's own entry from the inode.
        auto ownEntry = co_await inode.snapshotLoadDirEntry(txn);
        CO_RETURN_ON_ERROR(ownEntry);
        entry = std::move(*ownEntry);
        if (inode.asDirectory().name.empty()) {
          inode.asDirectory().name = entry->name;
          dirty = true;
        }
      }
      XLOGF_IF(DFATAL, entry->name != inode.asDirectory().name, "{} != {}", entry->name, inode.asDirectory().name);
      entry->dirAcl = inode.acl;
      CO_RETURN_ON_ERROR(co_await entry->addIntoReadConflict(txn));
      CO_RETURN_ON_ERROR(co_await entry->store(txn));
    }

    if (dirty) {
      // NOTE: add inode into read conflict set
      CO_RETURN_ON_ERROR(co_await inode.addIntoReadConflict(txn));
      CO_RETURN_ON_ERROR(co_await inode.store(txn));
    }
    co_return SetAttrRsp(std::move(inode));
  }

  /**
   * Runs the base-class finish handling, then invalidates the ACL cache entry of the inode when the
   * operation succeeded and the request carried a uid, gid, perm or iflags value.
   */
  void finish(const Result<SetAttrRsp> &result) override {
    Operation<SetAttrRsp>::finish(result);
    if (result.hasError()) {
      return;
    }
    if (req_.uid || req_.gid || req_.perm || req_.iflags) {
      aclCache().invalid(result->stat.id);
    }
  }

 private:
  // The request being executed; held by reference, so it must outlive this operation.
  const SetAttrReq &req_;
};
/** Build the operation object that executes `req_` as a SetAttrOp bound to this store. */
MetaStore::OpPtr<SetAttrRsp> MetaStore::setAttr(const SetAttrReq &req_)
{
  return std::make_unique<SetAttrOp>(*this, req_);
}
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,182 @@
#include <algorithm>
#include <cmath>
#include <compare>
#include <folly/ScopeGuard.h>
#include <folly/lang/Ordering.h>
#include <folly/logging/xlog.h>
#include <linux/fs.h>
#include <memory>
#include <optional>
#include <type_traits>
#include "common/kv/ITransaction.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Result.h"
#include "common/utils/UtcTime.h"
#include "fbs/meta/Schema.h"
#include "fbs/meta/Service.h"
#include "meta/base/Config.h"
#include "meta/service/MetaOperator.h"
#include "meta/store/Inode.h"
#include "meta/store/Utils.h"
namespace hf3fs::meta::server {
class SetAttr {
public:
static Result<Void> check(const Inode &inode, const SetAttrReq &req, const Config &config) {
RETURN_ON_ERROR(req.valid());
// permission check for setPermission
if (inode.id.isTreeRoot() && (req.perm || req.uid || req.gid)) {
XLOGF(WARN, "Don't allow change permission of tree root {}!", inode.id);
return makeError(MetaCode::kNoPermission, fmt::format("Don't allow change permission of {}", inode.id));
}
if (req.iflags.has_value() && *req.iflags != inode.acl.iflags) {
auto setChainAllocation = !(inode.acl.iflags & FS_CHAIN_ALLOCATION_FL) && (*req.iflags & FS_CHAIN_ALLOCATION_FL);
if (setChainAllocation && !config.iflags_chain_allocation()) {
return makeError(MetaCode::kNoPermission, "FS_CHAIN_ALLOCATION_FL disabled");
}
auto setNewChunkEngine = !(inode.acl.iflags & FS_NEW_CHUNK_ENGINE) && (*req.iflags & FS_NEW_CHUNK_ENGINE);
if (setNewChunkEngine && !config.iflags_chunk_engine()) {
return makeError(MetaCode::kNoPermission, "FS_NEW_CHUNK_ENGINE disabled");
}
auto changed = *req.iflags ^ inode.acl.iflags;
auto ownerChangeable = config.allow_owner_change_immutable() ? (uint32_t)(FS_HUGE_FILE_FL | FS_IMMUTABLE_FL)
: (uint32_t)(FS_HUGE_FILE_FL);
auto permCheck = req.user.isRoot() || (req.user.uid == inode.acl.uid && changed == (changed & ownerChangeable));
if (!permCheck) {
// NOTE: only allow root user set inode flags, file owner can use chattr +/- i, or set FS_HUGE_FILE_FL
return makeError(MetaCode::kNoPermission, "only root can set iflags");
}
}
if (req.perm.has_value() && *req.perm != inode.acl.perm && !req.user.isRoot() && req.user.uid != inode.acl.uid) {
// man 2 chmod: The effective UID of the calling process must match the owner of the file, or the process must be
// privileged (Linux: it must have the CAP_FOWNER capability).
return makeError(MetaCode::kNoPermission, "no perm to set perm");
}
if (req.uid.has_value() && *req.uid != inode.acl.uid && !req.user.isRoot()) {
// Only a privileged process (Linux: one with the CAP_CHOWN capability) may change the owner of a file.
return makeError(MetaCode::kNoPermission, "no perm to set uid");
}
if (req.gid.has_value() && *req.gid != inode.acl.gid && !req.user.isRoot() &&
(req.user.uid != inode.acl.uid || !req.user.inGroup(req.gid.value()))) {
// The owner of a file may change the group of the file to any group of which that owner is a member. A
// privileged process (Linux: with CAP_CHOWN) may change the group arbitrarily.
return makeError(MetaCode::kNoPermission, "no perm to set gid");
}
// permission check for utimes
// To set both file timestamps to the current time (i.e., times is NULL, or both tv_nsec fields specify UTIME_NOW),
// either:
// 1. the caller must have write access to the file;
// 2. the caller's effective user ID must match the owner of the file; or
// 3. the caller must have appropriate privileges.
// NOTE: we use UtcTime(0) as UTIME_NOW
auto cond1 = inode.acl.checkPermission(req.user, AccessType::WRITE).hasValue();
auto cond2 = req.user.uid == inode.acl.uid;
auto cond3 = req.user.isRoot();
if (req.atime || req.mtime) {
if ((req.atime && req.atime != SETATTR_TIME_NOW) || (req.mtime && req.mtime != SETATTR_TIME_NOW)) {
// To make any change other than setting both timestamps to the current time (i.e., times is not NULL, and
// neither tv_nsec field is UTIME_NOW and neither tv_nsec field is UTIME_OMIT), either condition 2 or 3 above
// must apply.
if (!cond2 && !cond3) {
return makeError(MetaCode::kNoPermission);
}
} else {
if (!cond1 && !cond2 && !cond3) {
return makeError(MetaCode::kNoPermission);
}
}
}
// permission check for setLayout
if (req.layout) {
if (!inode.isDirectory()) {
return makeError(MetaCode::kNotDirectory, "setLayout but not directory");
}
RETURN_ON_ERROR(inode.acl.checkPermission(req.user, AccessType::WRITE));
}
if (!inode.isFile() && req.dynStripe) {
return makeError(MetaCode::kNotFile, "extend dynStripe but not file");
}
return Void{};
}
static bool apply(Inode &inode, const SetAttrReq &req, Duration resolution, uint32_t stripeGrowth) {
// now we can do update.
bool dirty = false;
// setPermission
dirty |= update(inode.acl.iflags, req.iflags);
dirty |= update(inode.acl.uid, req.uid);
dirty |= update(inode.acl.gid, req.gid);
dirty |= update(inode.acl.perm, req.perm);
// setLayout
if (req.layout.has_value()) {
dirty |= update(inode.asDirectory().layout, req.layout);
}
if (dirty) {
update(inode.ctime, SETATTR_TIME_NOW, resolution, true /* cmp */);
}
// utimes
dirty |= update(inode.atime, req.atime, resolution, false /* cmp */);
dirty |= update(inode.mtime, req.mtime, resolution, false /* cmp */);
// extend
if (req.dynStripe && inode.asFile().dynStripe && inode.asFile().dynStripe < req.dynStripe) {
XLOGF_IF(FATAL, !inode.isFile(), "inode {} is not file", inode);
auto growth = std::max(2u, stripeGrowth);
auto dynStripe = inode.asFile().dynStripe;
while (dynStripe < std::min(req.dynStripe, inode.asFile().layout.stripeSize)) {
dynStripe = std::min(dynStripe * growth, inode.asFile().layout.stripeSize);
}
dirty |= update(inode.asFile().dynStripe, dynStripe);
}
return dirty;
}
static bool update(UtcTime &v, std::optional<UtcTime> nv, Duration resolution, bool cmp) {
if (!nv) {
return false;
}
if (*nv == SETATTR_TIME_NOW) {
nv = UtcClock::now();
}
nv = nv->castGranularity(resolution);
if (*nv != v && (!cmp || (*nv > v))) {
v = *nv;
return true;
} else {
return false;
}
}
template <typename T>
static bool update(T &v, std::optional<T> nv) {
static_assert(!std::is_same_v<T, UtcTime>);
if (nv.has_value() && nv != v) {
v = *nv;
return true;
} else {
return false;
}
}
template <typename T>
static bool update(T &v, T nv) {
if (nv != v) {
v = nv;
return true;
} else {
return false;
}
}
};
} // namespace hf3fs::meta::server

154
src/meta/store/ops/Stat.cc Normal file
View File

@@ -0,0 +1,154 @@
#include <algorithm>
#include <fcntl.h>
#include <folly/Likely.h>
#include <folly/Unit.h>
#include <folly/logging/xlog.h>
#include <memory>
#include <optional>
#include <type_traits>
#include "common/monitor/Recorder.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Result.h"
#include "common/utils/StatusCode.h"
#include "fbs/meta/Common.h"
#include "fbs/meta/Service.h"
#include "fbs/meta/Utils.h"
#include "meta/store/BatchContext.h"
#include "meta/store/Inode.h"
#include "meta/store/MetaStore.h"
#include "meta/store/Operation.h"
namespace hf3fs::meta::server {
namespace {
// File-local counters, one per inode type; StatOp::run adds one sample to the matching counter after
// a stat request has been resolved successfully.
monitor::CountRecorder statFile("meta_server.stat_file");
monitor::CountRecorder statDir("meta_server.stat_dir");
monitor::CountRecorder statSymlink("meta_server.stat_symlink");
} // namespace
/**
 * MetaStore::stat
 *
 * Read-only operation that resolves the requested path to an inode and returns it, counting the
 * request in the per-type stat recorders.
 */
class StatOp : public ReadOnlyOperation<StatRsp> {
 public:
  StatOp(MetaStore &meta, const StatReq &req)
      : ReadOnlyOperation<StatRsp>(meta),
        req_(req) {}

  OPERATION_TAGS(req_);

  /** Resolves the inode for the request and wraps it in a StatRsp; resolution errors are returned as is. */
  CoTryTask<StatRsp> run(IReadOnlyTransaction &txn) override {
    XLOGF(DBG, "StatOp::run, req {}", req_);
    CHECK_REQUEST(req_);

    auto stat = co_await resolve(txn, req_.user)
                    .inode(req_.path, req_.flags, !config().allow_stat_deleted_inodes() /* checkRefCnt */);
    CO_RETURN_ON_ERROR(stat);

    // Bump the counter matching the inode type; any other type value is not counted.
    const auto inodeType = stat->getType();
    if (inodeType == InodeType::File) {
      statFile.addSample(1);
    } else if (inodeType == InodeType::Directory) {
      statDir.addSample(1);
    } else if (inodeType == InodeType::Symlink) {
      statSymlink.addSample(1);
    }

    co_return StatRsp(std::move(*stat));
  }

 private:
  // The request being executed; held by reference, so it must outlive this operation.
  const StatReq &req_;
};
/**
 * Read-only operation shared by MetaStore::batchStat (Req == BatchStatReq, lookup by inode id) and
 * MetaStore::batchStatByPath (Req == BatchStatByPathReq, lookup by path).
 *
 * The request items are processed in order, in groups of at most `concurrent` lookups that are started
 * together on the current executor and awaited as a group before the next group is started.
 */
template <typename Req, typename Rsp>
class BatchStatOp : public ReadOnlyOperation<Rsp> {
 public:
  BatchStatOp(MetaStore &meta, const Req &req)
      : ReadOnlyOperation<Rsp>(meta),
        req_(req) {}

  OPERATION_TAGS(req_);

  /** The request's item list: inode ids for BatchStatReq, paths otherwise. */
  auto &vector() {
    if constexpr (std::is_same_v<Req, BatchStatReq>) {
      return req_.inodeIds;
    } else {
      return req_.paths;
    }
  }

  /** For by-path requests creates a BatchContext whose result run() keeps alive; a no-op Unit otherwise. */
  auto createBatchContext() {
    if constexpr (std::is_same_v<Req, BatchStatReq>) {
      return folly::Unit{};
    } else {
      return BatchContext::create();
    }
  }

  /** Resolves one path of a by-path request to its inode. */
  CoTryTask<Inode> resolve(IReadOnlyTransaction &txn, const PathAt &path) {
    co_return co_await ReadOnlyOperation<Rsp>::resolve(txn, req_.user)
        .inode(path, req_.flags, !this->config().allow_stat_deleted_inodes() /* checkRefCnt */);
  }

  /**
   * Executes all lookups and returns them, in request order, wrapped in `Rsp`.
   *
   * By inode id: any failed load aborts the batch with that error. By path: a per-item error is kept in
   * the response when ErrorHandling::success() accepts it, otherwise it aborts the batch.
   */
  CoTryTask<Rsp> run(IReadOnlyTransaction &txn) override {
    XLOGF(DBG, "BatchStatOp::run, req {}", req_);
    CHECK_REQUEST(req_);
    static constexpr auto byPath = !std::is_same_v<Req, BatchStatReq>;
    using ResultType = std::conditional_t<byPath, Result<meta::Inode>, std::optional<meta::Inode>>;
    using TaskResultType = std::conditional_t<byPath, Result<Inode>, Result<std::optional<Inode>>>;

    // Never run with a concurrency of zero, whatever the configuration says.
    size_t concurrent =
        std::max(1u, !byPath ? this->config().batch_stat_concurrent() : this->config().batch_stat_by_path_concurrent());
    auto exec = co_await folly::coro::co_current_executor;
    [[maybe_unused]] auto guard = createBatchContext();

    std::vector<ResultType> inodes;
    // One output element is produced per request item, so size the output once.
    inodes.reserve(vector().size());

    auto iter = vector().begin();
    while (iter != vector().end()) {
      // Start up to `concurrent` lookups, then wait for the whole group.
      std::vector<folly::SemiFuture<TaskResultType>> tasks;
      while (iter != vector().end() && tasks.size() < concurrent) {
        if constexpr (!byPath) {
          tasks.push_back(Inode::snapshotLoad(txn, *iter).scheduleOn(exec).start());
        } else {
          static_assert(std::is_same_v<Req, BatchStatByPathReq>);
          tasks.push_back(resolve(txn, *iter).scheduleOn(exec).start());
        }
        ++iter;
      }
      auto results = co_await folly::coro::collectAllRange(std::move(tasks));
      // Iterate by reference: each element holds a whole inode (or error) and is copied into `inodes` anyway.
      for (auto &result : results) {
        if (result.hasError() && (!byPath || !ErrorHandling::success(result))) {
          XLOGF(INFO, "batch stat error {}", result.error());
          CO_RETURN_ERROR(result);
        }
        if constexpr (byPath) {
          inodes.push_back(result);
        } else {
          inodes.push_back(*result);
        }
      }
    }
    co_return Rsp(std::move(inodes));
  }

 private:
  // The request being executed; held by reference, so it must outlive this operation.
  const Req &req_;
};
/** Build the operation object that executes `req` as a StatOp bound to this store. */
MetaStore::OpPtr<StatRsp> MetaStore::stat(const StatReq &req) {
  return std::make_unique<StatOp>(*this, req);
}
/** Build the by-inode-id batch stat operation for `req`. */
MetaStore::OpPtr<BatchStatRsp> MetaStore::batchStat(const BatchStatReq &req)
{
  using Op = BatchStatOp<BatchStatReq, BatchStatRsp>;
  return std::make_unique<Op>(*this, req);
}
/** Build the by-path batch stat operation for `req`. */
MetaStore::OpPtr<BatchStatByPathRsp> MetaStore::batchStatByPath(const BatchStatByPathReq &req)
{
  using Op = BatchStatOp<BatchStatByPathReq, BatchStatByPathRsp>;
  return std::make_unique<Op>(*this, req);
}
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,32 @@
#include <chrono>
#include <folly/logging/xlog.h>
#include <memory>
#include "common/kv/ITransaction.h"
#include "common/utils/Result.h"
#include "meta/store/MetaStore.h"
#include "meta/store/Operation.h"
namespace hf3fs::meta::server {
/**
 * MetaStore::statFs
 *
 * Read-only operation that forwards the request to the file helper's statFs; the transaction passed to
 * run() is not used.
 */
class StatFsOp : public ReadOnlyOperation<StatFsRsp> {
 public:
  StatFsOp(MetaStore &meta, const StatFsReq &req)
      : ReadOnlyOperation<StatFsRsp>(meta),
        req_(req) {}

  OPERATION_TAGS(req_);

  /** Returns whatever fileHelper().statFs() yields for the requesting user. */
  CoTryTask<StatFsRsp> run(IReadOnlyTransaction &) override {
    XLOGF(DBG, "StatFsOp::run {}", req_);
    // NOTE(review): meaning of the 30 s argument is defined by fileHelper().statFs — presumably a cache
    // lifetime or timeout; confirm against its declaration.
    const auto statFsDuration = std::chrono::seconds(30);
    co_return co_await fileHelper().statFs(req_.user, statFsDuration);
  }

 private:
  // The request being executed; held by reference, so it must outlive this operation.
  const StatFsReq &req_;
};
/** Build the operation object that executes `req` as a StatFsOp bound to this store. */
MetaStore::OpPtr<StatFsRsp> MetaStore::statFs(const StatFsReq &req) {
  return std::make_unique<StatFsOp>(*this, req);
}
} // namespace hf3fs::meta::server

View File

@@ -0,0 +1,94 @@
#include <cassert>
#include <fcntl.h>
#include <folly/logging/xlog.h>
#include <memory>
#include "common/kv/ITransaction.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Result.h"
#include "meta/event/Event.h"
#include "meta/store/MetaStore.h"
#include "meta/store/Operation.h"
namespace hf3fs::meta::server {
/**
 * MetaStore::symlink
 *
 * Read-write operation that creates a symlink inode plus its directory entry under the resolved parent.
 * A request whose uuid matches an already existing entry is treated as a replay and answered with the
 * existing inode.
 */
class SymlinkOp : public Operation<SymlinkRsp> {
 public:
  SymlinkOp(MetaStore &meta, const SymlinkReq &req)
      : Operation<SymlinkRsp>(meta),
        req_(req) {}

  OPERATION_TAGS(req_);

  /**
   * Creates the symlink inside `txn`.
   *
   * @return the new (or, for a replayed request, the existing) inode; kExists when the name is already
   *         taken by an unrelated entry; otherwise the first resolution / permission / storage error.
   */
  CoTryTask<SymlinkRsp> run(IReadWriteTransaction &txn) override {
    XLOGF(DBG, "SymlinkOp {}", req_);
    CHECK_REQUEST(req_);

    auto resolved = co_await resolve(txn, req_.user).path(req_.path, AtFlags(AT_SYMLINK_NOFOLLOW));
    CO_RETURN_ON_ERROR(resolved);

    if (resolved->dirEntry.has_value()) {
      auto &existing = *resolved->dirEntry;
      const bool replayed = existing.uuid != Uuid::zero() && existing.uuid == req_.uuid;
      if (!replayed) {
        co_return makeError(MetaCode::kExists);
      }
      // this may happen when FDB returns commit_unknown_result, or we failed to send the response to the client
      XLOGF(CRITICAL, "Symlink already created, dst {}, req {}, uuid {}", existing, req_, req_.uuid);
      auto existingInode = co_await existing.snapshotLoadInode(txn);
      CO_RETURN_ON_ERROR(existingInode);
      co_return SymlinkRsp(std::move(*existingInode));
    }

    // check permission and lock
    auto parent = co_await resolved->getParentInode(txn);
    CO_RETURN_ON_ERROR(parent);
    CO_RETURN_ON_ERROR(parent->acl.checkPermission(req_.user, AccessType::WRITE));
    CO_RETURN_ON_ERROR(parent->asDirectory().checkLock(req_.client));

    auto newInodeId = co_await allocateInodeId(txn, false);
    CO_RETURN_ON_ERROR(newInodeId);

    assert(req_.path.path.has_value());
    InodeId parentId = resolved->getParentId();
    DirEntry entry = DirEntry::newSymlink(parentId, req_.path.path->filename().native(), *newInodeId);
    // Tag the entry with the request uuid so a retried request can be recognised above.
    entry.uuid = req_.uuid;
    Inode inode = Inode::newSymlink(*newInodeId, req_.target, req_.user.uid, req_.user.gid, now());

    // NOTE: add parent inode and dirEntry into read conflict set.
    // add parent inode into read conflict set to prevent parent is removed concurrently
    CO_RETURN_ON_ERROR(co_await Inode(parentId).addIntoReadConflict(txn));
    // add directory entry into read conflict set to prevent concurrent create
    CO_RETURN_ON_ERROR(co_await entry.addIntoReadConflict(txn));

    // create inode and dirEntry
    CO_RETURN_ON_ERROR(co_await entry.store(txn));
    CO_RETURN_ON_ERROR(co_await inode.store(txn));

    addEvent(Event::Type::Symlink)
        .addField("parent", entry.parent)
        .addField("name", entry.name)
        .addField("target", inode.asSymlink().target.native())
        .addField("user", req_.user.uid)
        .addField("host", req_.client.hostname);
    addTrace(MetaEventTrace{
        .eventType = Event::Type::Symlink,
        .parentId = entry.parent,
        .entryName = entry.name,
        .userId = req_.user.uid,
        .client = req_.client,
        .symLinkTarget = inode.asSymlink().target,
    });

    co_return SymlinkRsp(std::move(inode));
  }

 private:
  // The request being executed; held by reference, so it must outlive this operation.
  const SymlinkReq &req_;
};
/** Builds the operation object that serves a Symlink request; the operation keeps a reference to req. */
MetaStore::OpPtr<SymlinkRsp> MetaStore::symlink(const SymlinkReq &req) {
  OpPtr<SymlinkRsp> op = std::make_unique<SymlinkOp>(*this, req);
  return op;
}
} // namespace hf3fs::meta::server