mirror of
https://github.com/deepseek-ai/3FS
synced 2025-06-26 18:16:45 +00:00
Initial commit
This commit is contained in:
5
src/storage/CMakeLists.txt
Normal file
5
src/storage/CMakeLists.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
add_crate(chunk_engine)
|
||||
|
||||
target_add_lib(storage core-app core-service memory-common storage-fbs mgmtd-client storage-client kv analytics aio chunk_engine profiler)
|
||||
target_include_directories(storage PUBLIC ${CMAKE_SOURCE_DIR}/third_party/leveldb)
|
||||
target_add_bin(storage_main "storage.cpp" storage jemalloc)
|
||||
96
src/storage/aio/AioReadWorker.cc
Normal file
96
src/storage/aio/AioReadWorker.cc
Normal file
@@ -0,0 +1,96 @@
|
||||
#include "storage/aio/AioReadWorker.h"
|
||||
|
||||
#include <folly/ScopeGuard.h>
|
||||
#include <folly/logging/xlog.h>
|
||||
#include <folly/system/ThreadName.h>
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "storage/aio/AioStatus.h"
|
||||
#include "storage/aio/BatchReadJob.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
monitor::LatencyRecorder batchReadInQueueRecorder{"storage.batch_read_in_queue.latency"};
|
||||
monitor::CountRecorder aioRunningThreadsCount{"storage.aio_running_threads.count", std::nullopt, false};
|
||||
|
||||
// Stops the worker tasks and joins the executor; the Result returned by stopAndJoin() is discarded.
AioReadWorker::~AioReadWorker() { stopAndJoin(); }
|
||||
|
||||
// Schedules config_.num_threads() worker tasks on the executor. Each task sets up its own AioStatus
// (and an IoUringStatus when enable_io_uring is set) and then enters run().
//
// @param fds     forwarded to IoUringStatus::init.
// @param iovecs  forwarded to IoUringStatus::init.
// @return the stored init result: Void on success, or an error recorded by a task whose setup failed.
//
// The call blocks (polling every 100ms) until every task has left its setup scope.
Result<Void> AioReadWorker::start(const std::vector<int> &fds, const std::vector<struct iovec> &iovecs) {
  const uint32_t threadCount = config_.num_threads();

  for (uint32_t idx = 0; idx < threadCount; ++idx) {
    // fds/iovecs are captured by reference; they are only read inside the setup scope below, and this
    // function does not return before all tasks have incremented initialized_.
    executors_.add([&]() {
      AioStatus aioStatus;
      IoUringStatus ioUringStatus;
      {
        // Mark this task as initialized on every exit path of the setup scope, failures included.
        SCOPE_EXIT { ++initialized_; };

        auto aioInit = aioStatus.init(config_.max_events());
        if (UNLIKELY(!aioInit)) {
          XLOGF(ERR, "aio status init failed: {}", aioInit.error());
          *initResult_.lock() = std::move(aioInit);
          return;
        }

        if (config_.enable_io_uring()) {
          auto uringInit = ioUringStatus.init(config_.max_events(), fds, iovecs);
          if (UNLIKELY(!uringInit)) {
            XLOGF(ERR, "io uring status init failed: {}", uringInit.error());
            *initResult_.lock() = std::move(uringInit);
            return;
          }
        }
      }
      run(aioStatus, ioUringStatus);
    });
  }

  // Wait for all tasks to finish setup; log on the first poll and then on every fifth one.
  int polls = 0;
  while (initialized_ != threadCount) {
    XLOGF_IF(INFO, polls % 5 == 0, "Waiting for AioReadWorker@{}::run start...", fmt::ptr(this));
    std::this_thread::sleep_for(100_ms);
    ++polls;
  }

  RETURN_AND_LOG_ON_ERROR(*initResult_.lock());
  return Void{};
}
|
||||
|
||||
// Enqueues one default-constructed (null) iterator per configured thread, then joins the executor.
// run() returns when it dequeues an iterator whose isNull() is true.
// Always returns Void.
Result<Void> AioReadWorker::stopAndJoin() {
  for (uint32_t sent = 0; sent < config_.num_threads(); ++sent) {
    queue_.enqueue(AioReadJobIterator{});
  }

  executors_.join();
  return Void{};
}
|
||||
|
||||
// Worker loop: repeatedly dequeues a batch iterator and drives it through collect/submit/reap on the
// selected backend until the iterator is exhausted. Returns Void when a null iterator is dequeued.
//
// @param aioStatus      libaio state owned by the calling task.
// @param ioUringStatus  io_uring state owned by the calling task.
Result<Void> AioReadWorker::run(AioStatus &aioStatus, IoUringStatus &ioUringStatus) {
  aioRunningThreadsCount.addSample(1);
  auto runningGuard = folly::makeGuard([] { aioRunningThreadsCount.addSample(-1); });

  while (true) {
    // Step 1: fetch a batch read job. The running counter is decremented around the dequeue call.
    aioRunningThreadsCount.addSample(-1);
    auto jobs = queue_.dequeue();  // waiting.
    aioRunningThreadsCount.addSample(1);

    if (jobs.isNull()) {
      XLOGF(DBG, "Stop AioReadWorker {}...", fmt::ptr(this));
      return Void{};
    }

    batchReadInQueueRecorder.addSample(RelativeTime::now() - jobs.startTime());
    jobs->batch().resetStartTime();

    // The backend is chosen once per dequeued iterator.
    IoStatus &io = *(config_.useIoUring() ? static_cast<IoStatus *>(&ioUringStatus)
                                          : static_cast<IoStatus *>(&aioStatus));
    io.setAioReadJobIterator(jobs);

    do {
      // Step 2: collect a batch of read jobs.
      io.collect();

      // Step 3: submit the collected jobs.
      io.submit();

      // Step 4: reap completions until nothing is in flight.
      while (io.inflight()) {
        io.reap(config_.min_complete());
      }
    } while (io.hasUnfinishedBatchReadJob());
  }

  return Void{};
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
78
src/storage/aio/AioReadWorker.h
Normal file
78
src/storage/aio/AioReadWorker.h
Normal file
@@ -0,0 +1,78 @@
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <folly/Random.h>
|
||||
#include <folly/executors/CPUThreadPoolExecutor.h>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "common/utils/BoundedQueue.h"
|
||||
#include "common/utils/ConfigBase.h"
|
||||
#include "common/utils/Result.h"
|
||||
#include "storage/aio/AioStatus.h"
|
||||
#include "storage/aio/BatchReadJob.h"
|
||||
#include "storage/store/StorageTargets.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
class AioReadWorker {
|
||||
public:
|
||||
enum class IoEngine {
|
||||
libaio,
|
||||
io_uring,
|
||||
random,
|
||||
};
|
||||
|
||||
class Config : public ConfigBase<Config> {
|
||||
CONFIG_ITEM(num_threads, 32ul);
|
||||
CONFIG_ITEM(queue_size, 4096u);
|
||||
CONFIG_ITEM(max_events, 512u);
|
||||
CONFIG_ITEM(enable_io_uring, true);
|
||||
CONFIG_HOT_UPDATED_ITEM(min_complete, 128u);
|
||||
CONFIG_HOT_UPDATED_ITEM(wait_all_inflight, false); // deprecated.
|
||||
CONFIG_HOT_UPDATED_ITEM(inflight_control_offset, 128); // deprecated.
|
||||
CONFIG_HOT_UPDATED_ITEM(ioengine, IoEngine::libaio);
|
||||
|
||||
public:
|
||||
inline bool useIoUring() const {
|
||||
if (!enable_io_uring()) {
|
||||
return false;
|
||||
}
|
||||
switch (ioengine()) {
|
||||
case IoEngine::io_uring:
|
||||
return true;
|
||||
case IoEngine::libaio:
|
||||
return false;
|
||||
case IoEngine::random:
|
||||
return folly::Random::rand32() & 1;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
AioReadWorker(const Config &config)
|
||||
: config_(config),
|
||||
queue_(config.queue_size()),
|
||||
executors_(std::make_pair(config_.num_threads(), config_.num_threads()),
|
||||
std::make_shared<folly::NamedThreadFactory>("AioRead")) {}
|
||||
~AioReadWorker();
|
||||
|
||||
CoTask<void> enqueue(AioReadJobIterator job) { co_await queue_.co_enqueue(job); }
|
||||
|
||||
Result<Void> start(const std::vector<int> &fds, const std::vector<struct iovec> &iovecs);
|
||||
|
||||
Result<Void> stopAndJoin();
|
||||
|
||||
protected:
|
||||
Result<Void> run(AioStatus &aioStatus, IoUringStatus &ioUringStatus);
|
||||
|
||||
private:
|
||||
ConstructLog<"storage::AioReadWorker"> constructLog_;
|
||||
const Config &config_;
|
||||
BoundedQueue<AioReadJobIterator> queue_;
|
||||
|
||||
folly::CPUThreadPoolExecutor executors_;
|
||||
std::atomic<uint32_t> initialized_{};
|
||||
folly::Synchronized<Result<Void>, std::mutex> initResult_{Void{}};
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
286
src/storage/aio/AioStatus.cc
Normal file
286
src/storage/aio/AioStatus.cc
Normal file
@@ -0,0 +1,286 @@
|
||||
#include "storage/aio/AioStatus.h"
|
||||
|
||||
#include <chrono>
|
||||
#include <liburing.h>
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/utils/Duration.h"
|
||||
#include "storage/aio/BatchReadJob.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
namespace {
|
||||
|
||||
monitor::DistributionRecorder inflightNum("storage.aio.inflight");
|
||||
monitor::CountRecorder aioReadFailCount{"storage.aio.fail_count"};
|
||||
monitor::CountRecorder aioReadEIOCount{"storage.aio.eio_count"};
|
||||
|
||||
monitor::OperationRecorder ioCollectRecorder{"storage.io_collect"};
|
||||
|
||||
monitor::OperationRecorder ioSubmitRecorder{"storage.io_submit"};
|
||||
monitor::DistributionRecorder ioSubmitSize("storage.io_submit.size");
|
||||
monitor::DistributionRecorder ioSubmitLoop("storage.io_submit.loop");
|
||||
monitor::CountRecorder ioSubmitBadFd("storage.io_submit.badfd_count");
|
||||
monitor::CountRecorder ioSubmitError("storage.io_submit.error_count");
|
||||
monitor::OperationRecorder ioGetEventsRecorder{"storage.io_getevents"};
|
||||
monitor::DistributionRecorder ioGetEventsSize("storage.io_getevents.size");
|
||||
|
||||
void setReadJobResult(void *raw, int64_t res) {
|
||||
auto job = reinterpret_cast<AioReadJob *>(raw);
|
||||
auto storageTarget = job->state().storageTarget;
|
||||
if (res >= 0) {
|
||||
auto latency = RelativeTime::now() - job->startTime();
|
||||
storageTarget->recordRealRead(res, latency);
|
||||
auto length = std::min(std::min(std::max(0l, res - job->state().headLength), int64_t(job->readIO().length)),
|
||||
std::max(0l, int64_t(job->state().chunkLen) - job->readIO().offset));
|
||||
if (UNLIKELY(length == 0 && job->readIO().length > 0)) {
|
||||
XLOGF(WARNING, "read length is 0: {}, state: {}", job->readIO(), job->state());
|
||||
}
|
||||
job->setResult(length);
|
||||
// WARNING: job is no longer available.
|
||||
} else {
|
||||
if (storageTarget == nullptr) {
|
||||
aioReadFailCount.addSample(1);
|
||||
} else {
|
||||
aioReadFailCount.addSample(1, storageTarget->tag());
|
||||
}
|
||||
XLOGF(ERR,
|
||||
"set read job failed: {}, state: {}, buf: {}, code: {}",
|
||||
job->readIO(),
|
||||
job->state(),
|
||||
fmt::ptr(job->state().localbuf.ptr()),
|
||||
-res);
|
||||
job->setResult(makeError(StorageCode::kChunkReadFailed, fmt::format("errno: {}", -res)));
|
||||
// WARNING: job is no longer available.
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Destroys the aio context if one was set up.
AioStatus::~AioStatus() {
  if (aioContext_ != nullptr) {
    ::io_destroy(aioContext_);
  }
}
|
||||
|
||||
// Creates the aio context and pre-allocates `maxEvents` iocbs and event slots.
//
// @param maxEvents  capacity passed to io_setup and used to size the iocb/event arrays.
// @return Void on success, kInvalidConfig when io_setup does not return 0.
Result<Void> AioStatus::init(uint32_t maxEvents) {
  maxEvents_ = maxEvents;

  // Step 1: set up the aio context.
  const int rc = ::io_setup(maxEvents, &aioContext_);
  if (UNLIKELY(rc != 0)) {
    auto msg = fmt::format("init aio context failed: {}, maxEvents {}", rc, maxEvents);
    XLOG(ERR, msg);
    return makeError(StatusCode::kInvalidConfig, std::move(msg));
  }

  // Step 2: allocate the iocbs and put a pointer to each of them on the free list.
  iocbs_.resize(maxEvents);
  availables_.reserve(maxEvents);
  for (auto &cb : iocbs_) {
    availables_.push_back(&cb);
  }
  events_.resize(maxEvents);
  return Void{};
}
|
||||
|
||||
void AioStatus::collect() {
|
||||
auto recordGuard = ioCollectRecorder.record();
|
||||
while (availableToSubmit() && iterator_) {
|
||||
auto &job = *iterator_++;
|
||||
auto result = job.state().storageTarget->aioPrepareRead(job);
|
||||
if (UNLIKELY(!result)) {
|
||||
job.setResult(makeError(std::move(result.error())));
|
||||
continue;
|
||||
}
|
||||
|
||||
++readyToSubmit_;
|
||||
++inflight_;
|
||||
auto iocb = availables_.back();
|
||||
availables_.pop_back();
|
||||
auto &state = job.state();
|
||||
job.resetStartTime();
|
||||
::io_prep_pread(iocb, state.readFd, state.localbuf.ptr(), state.readLength, state.readOffset);
|
||||
iocb->data = &job;
|
||||
}
|
||||
recordGuard.succ();
|
||||
}
|
||||
|
||||
void AioStatus::submit() {
|
||||
uint32_t submitStartPoint = availables_.size();
|
||||
uint32_t loopCnt = 0;
|
||||
while (readyToSubmit_) {
|
||||
++loopCnt;
|
||||
auto recordGuard = ioSubmitRecorder.record();
|
||||
int ret = ::io_submit(aioContext_, readyToSubmit_, &availables_[submitStartPoint]);
|
||||
auto elapsedTime = RelativeTime::now() - recordGuard.startTime();
|
||||
if (UNLIKELY(elapsedTime >= 5_s)) {
|
||||
XLOGF(WARNING, "io_submit took too long {}, submit {} ret {}", elapsedTime.asMs(), readyToSubmit_, ret);
|
||||
}
|
||||
if (ret >= 0) {
|
||||
recordGuard.succ();
|
||||
ioSubmitSize.addSample(ret);
|
||||
submitStartPoint += ret;
|
||||
readyToSubmit_ -= ret;
|
||||
} else if (ret == -EAGAIN) {
|
||||
continue;
|
||||
} else if (ret == -EBADF) {
|
||||
XLOGF(ERR, "aio submit bad file descriptor {}. ret: {}", availables_[submitStartPoint]->aio_fildes, ret);
|
||||
// set failed and skip it.
|
||||
ioSubmitBadFd.addSample(1);
|
||||
setReadJobResult(availables_[submitStartPoint]->data, -EBADF);
|
||||
availables_.push_back(availables_[submitStartPoint]);
|
||||
++submitStartPoint;
|
||||
--readyToSubmit_;
|
||||
--inflight_;
|
||||
} else {
|
||||
ioSubmitError.addSample(1);
|
||||
XLOGF(ERR, "Unrecoverable aio submit error. ret: {}", ret);
|
||||
// set all jobs failed.
|
||||
while (readyToSubmit_) {
|
||||
setReadJobResult(availables_[submitStartPoint]->data, ret);
|
||||
availables_.push_back(availables_[submitStartPoint]);
|
||||
++submitStartPoint;
|
||||
--readyToSubmit_;
|
||||
--inflight_;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
ioSubmitLoop.addSample(loopCnt);
|
||||
inflightNum.addSample(inflight());
|
||||
}
|
||||
|
||||
void AioStatus::reap(uint32_t minCompleteIn) {
|
||||
uint32_t minComplete = std::min(inflight(), minCompleteIn);
|
||||
auto recordGuard = ioGetEventsRecorder.record();
|
||||
int ret = ::io_getevents(aioContext_, minComplete, inflight(), events_.data(), nullptr);
|
||||
if (LIKELY(ret >= 0)) {
|
||||
recordGuard.succ();
|
||||
ioGetEventsSize.addSample(ret);
|
||||
inflight_ -= ret;
|
||||
for (int i = 0; i < ret; ++i) {
|
||||
auto &event = events_[i];
|
||||
availables_.push_back(event.obj);
|
||||
setReadJobResult(event.data, event.res);
|
||||
}
|
||||
} else if (ret == -EINTR) {
|
||||
XLOGF(INFO, "aio is interrupted by a signal handler");
|
||||
return;
|
||||
} else {
|
||||
XLOGF(ERR, "aio io_getevents error: {}", ret);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Tears the ring down when its ring_fd is non-zero (ring_ is value-initialized in the class).
IoUringStatus::~IoUringStatus() {
  if (ring_.ring_fd != 0) {
    ::io_uring_queue_exit(&ring_);
  }
}
|
||||
|
||||
// Initializes the ring with `maxEvents` entries and registers the given files and buffers.
//
// @param maxEvents  ring size; also the in-flight limit used by availableToSubmit().
// @param fds        file descriptors to register; skipped when empty.
// @param iovecs     buffers to register; skipped when empty.
// @return Void on success, kInvalidConfig when any of the liburing calls does not return 0.
Result<Void> IoUringStatus::init(uint32_t maxEvents,
                                 const std::vector<int> &fds,
                                 const std::vector<struct iovec> &iovecs) {
  maxEvents_ = maxEvents;

  const auto initRc = ::io_uring_queue_init(maxEvents_, &ring_, 0);
  if (UNLIKELY(initRc != 0)) {
    auto msg = fmt::format("init io uring failed: {}, maxEvents {}", initRc, maxEvents);
    XLOG(ERR, msg);
    return makeError(StatusCode::kInvalidConfig, std::move(msg));
  }
  submittingJobs_.reserve(maxEvents_);

  if (!fds.empty()) {
    const int filesRc = ::io_uring_register_files(&ring_, fds.data(), fds.size());
    if (UNLIKELY(filesRc != 0)) {
      auto msg = fmt::format("io_uring_register_files failed: {}, size: {}", filesRc, fds.size());
      XLOG(ERR, msg);
      return makeError(StatusCode::kInvalidConfig, std::move(msg));
    }
  }

  if (!iovecs.empty()) {
    const int buffersRc = ::io_uring_register_buffers(&ring_, iovecs.data(), iovecs.size());
    if (UNLIKELY(buffersRc != 0)) {
      auto msg = fmt::format("io_uring_register_buffers failed: {}, size: {}", buffersRc, iovecs.size());
      XLOG(ERR, msg);
      return makeError(StatusCode::kInvalidConfig, std::move(msg));
    }
  }

  return Void{};
}
|
||||
|
||||
void IoUringStatus::collect() {
|
||||
auto recordGuard = ioCollectRecorder.record();
|
||||
while (availableToSubmit() && iterator_) {
|
||||
auto &job = *iterator_++;
|
||||
auto result = job.state().storageTarget->aioPrepareRead(job);
|
||||
if (UNLIKELY(!result)) {
|
||||
job.setResult(makeError(std::move(result.error())));
|
||||
continue;
|
||||
}
|
||||
|
||||
++inflight_;
|
||||
auto &state = job.state();
|
||||
|
||||
job.resetStartTime();
|
||||
struct io_uring_sqe *sqe = ::io_uring_get_sqe(&ring_);
|
||||
assert(sqe != nullptr);
|
||||
::io_uring_prep_read_fixed(sqe,
|
||||
state.fdIndex.value_or(state.readFd),
|
||||
state.localbuf.ptr(),
|
||||
state.readLength,
|
||||
state.readOffset,
|
||||
state.bufferIndex);
|
||||
if (state.fdIndex) {
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
}
|
||||
::io_uring_sqe_set_data(sqe, &job);
|
||||
submittingJobs_.push_back(&job);
|
||||
}
|
||||
recordGuard.succ();
|
||||
}
|
||||
|
||||
void IoUringStatus::submit() {
|
||||
auto recordGuard = ioSubmitRecorder.record();
|
||||
int ret = ::io_uring_submit(&ring_);
|
||||
if (LIKELY(ret >= 0)) {
|
||||
assert(ret == (int)inflight_);
|
||||
recordGuard.succ();
|
||||
ioSubmitSize.addSample(ret);
|
||||
} else {
|
||||
XLOGF(CRITICAL, "io_uring submit error: {}", ret);
|
||||
for (auto &job : submittingJobs_) {
|
||||
setReadJobResult(job, ret);
|
||||
}
|
||||
inflight_ -= submittingJobs_.size();
|
||||
}
|
||||
submittingJobs_.clear();
|
||||
}
|
||||
|
||||
void IoUringStatus::reap(uint32_t minCompleteIn) {
|
||||
auto recordGuard = ioGetEventsRecorder.record();
|
||||
io_uring_cqe *cqe = nullptr;
|
||||
int ret = ::io_uring_wait_cqes(&ring_, &cqe, std::min(inflight(), minCompleteIn), nullptr, nullptr);
|
||||
if (LIKELY(ret >= 0)) {
|
||||
recordGuard.succ();
|
||||
uint32_t cnt = 0;
|
||||
unsigned head = 0;
|
||||
io_uring_for_each_cqe(&ring_, head, cqe) {
|
||||
++cnt;
|
||||
setReadJobResult(::io_uring_cqe_get_data(cqe), cqe->res);
|
||||
}
|
||||
ioGetEventsSize.addSample(cnt);
|
||||
inflight_ -= cnt;
|
||||
::io_uring_cq_advance(&ring_, cnt);
|
||||
} else if (ret == -EINTR) {
|
||||
XLOGF(INFO, "io_uring is interrupted by a signal handler");
|
||||
return;
|
||||
} else {
|
||||
XLOGF(ERR, "io_uring wait_cqes error: {}", ret);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
73
src/storage/aio/AioStatus.h
Normal file
73
src/storage/aio/AioStatus.h
Normal file
@@ -0,0 +1,73 @@
|
||||
#pragma once
|
||||
|
||||
#include <libaio.h>
|
||||
#include <liburing.h>
|
||||
#include <vector>
|
||||
|
||||
#include "storage/aio/BatchReadJob.h"
|
||||
#include "storage/store/StorageTargets.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
class IoStatus {
|
||||
public:
|
||||
virtual ~IoStatus() = default;
|
||||
|
||||
bool hasUnfinishedBatchReadJob() const { return iterator_; }
|
||||
|
||||
void setAioReadJobIterator(AioReadJobIterator it) { iterator_ = it; }
|
||||
|
||||
bool availableToSubmit() const { return inflight_ < maxEvents_; }
|
||||
|
||||
uint32_t inflight() const { return inflight_; }
|
||||
|
||||
virtual void collect() = 0;
|
||||
|
||||
virtual void submit() = 0;
|
||||
|
||||
virtual void reap(uint32_t minCompleteIn) = 0;
|
||||
|
||||
protected:
|
||||
AioReadJobIterator iterator_;
|
||||
uint32_t maxEvents_ = 0;
|
||||
uint32_t inflight_ = 0;
|
||||
};
|
||||
|
||||
class AioStatus : public IoStatus {
|
||||
public:
|
||||
~AioStatus() override;
|
||||
|
||||
Result<Void> init(uint32_t maxEvents);
|
||||
|
||||
void collect() override;
|
||||
|
||||
void submit() override;
|
||||
|
||||
void reap(uint32_t minCompleteIn) override;
|
||||
|
||||
private:
|
||||
uint32_t readyToSubmit_ = 0;
|
||||
io_context_t aioContext_ = nullptr;
|
||||
std::vector<struct iocb> iocbs_;
|
||||
std::vector<struct iocb *> availables_;
|
||||
std::vector<struct io_event> events_;
|
||||
};
|
||||
|
||||
class IoUringStatus : public IoStatus {
|
||||
public:
|
||||
~IoUringStatus() override;
|
||||
|
||||
Result<Void> init(uint32_t maxEvents, const std::vector<int> &fds, const std::vector<struct iovec> &iovecs);
|
||||
|
||||
void collect() override;
|
||||
|
||||
void submit() override;
|
||||
|
||||
void reap(uint32_t minCompleteIn) override;
|
||||
|
||||
private:
|
||||
struct io_uring ring_ {};
|
||||
std::vector<AioReadJob *> submittingJobs_;
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
123
src/storage/aio/BatchReadJob.cc
Normal file
123
src/storage/aio/BatchReadJob.cc
Normal file
@@ -0,0 +1,123 @@
|
||||
#include "storage/aio/BatchReadJob.h"
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/utils/Duration.h"
|
||||
#include "storage/store/StorageTarget.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
monitor::CountRecorder rdmaWriteCount{"storage.rdma_write.count"};
|
||||
monitor::CountRecorder rdmaWriteFails{"storage.rdma_write.fails"};
|
||||
monitor::CountRecorder rdmaWriteBytes{"storage.rdma_write.bytes"};
|
||||
monitor::LatencyRecorder batchReadLatency{"storage.aio.batch_latency"};
|
||||
|
||||
monitor::CountRecorder aioChecksumMismatch{"storage.aio.checksum_mismatch"};
|
||||
|
||||
// Binds the job to its request, result slot and owning batch, and derives the head/tail padding that
// extends [offset, offset + length) to kAIOAlignSize boundaries.
AioReadJob::AioReadJob(const ReadIO &readIO, IOResult &result, BatchReadJob &batch)
    : readIO_(readIO),
      result_(result),
      batch_(batch) {
  // Bytes between the preceding alignment boundary and the requested offset.
  state_.headLength = readIO_.offset % kAIOAlignSize;

  // Bytes from the end of the request up to the next alignment boundary (0 when already aligned).
  const auto endRemainder = (readIO_.offset + readIO_.length) % kAIOAlignSize;
  state_.tailLength = (kAIOAlignSize - endRemainder) % kAIOAlignSize;
}
|
||||
|
||||
// Finalizes this job: fills in the checksum, lets the storage target validate the read, optionally
// re-verifies the full-chunk checksum, stores the result and notifies the owning batch.
//
// @param lengthInfo  number of usable bytes read, or the error that failed the read.
//
// Once batch_.finish() has been called the batch may complete, so nothing is touched afterwards.
void AioReadJob::setResult(Result<uint32_t> lengthInfo) {
  if (lengthInfo) {
    auto checksumType = batch_.checksumType();

    if (checksumType == ChecksumType::NONE) {
      result_.checksum = {ChecksumType::NONE, 0U};  // do not return checksum
    } else if (checksumType == state_.chunkChecksum.type && readIO_.offset == 0 && *lengthInfo == state_.chunkLen) {
      result_.checksum = state_.chunkChecksum;  // use chunk checksum if the full chunk is read
    } else {  // calculate checksum of the read data
      auto dataBuf = state_.localbuf.subrange(state_.headLength, *lengthInfo);
      result_.checksum = ChecksumInfo::create(checksumType, dataBuf.ptr(), dataBuf.size());
    }

    // check chunk version.
    auto result = state_.storageTarget->aioFinishRead(*this);
    if (UNLIKELY(!result)) {
      lengthInfo = makeError(std::move(result.error()));
    }

    // lengthInfo may have just been replaced by an error above, so it must be re-checked before it
    // is dereferenced; a failed finish skips the checksum re-verification.
    if (lengthInfo && batch_.recalculateChecksum() && readIO_.offset == 0 && *lengthInfo == state_.chunkLen) {
      auto realChecksum = ChecksumInfo::create(state_.chunkChecksum.type, state_.localbuf.ptr(), *lengthInfo);
      if (UNLIKELY(realChecksum != state_.chunkChecksum)) {
        aioChecksumMismatch.addSample(1);
        auto msg = fmt::format("aio checksum mismatch, read: {}, state: {}, checksum: {}",
                               readIO(),
                               state(),
                               realChecksum.value);
        XLOG(CRITICAL, msg);
        lengthInfo = makeError(StorageCode::kChecksumMismatch, std::move(msg));
      }
    }
  }

  XLOGF_IF(WARN, !lengthInfo, "Read job failed, result: {}, read io: {}, state: {}", lengthInfo, readIO_, state_);
  XLOGF(DBG7, "Read job completed, result: {}, read io: {}, state: {}", lengthInfo, readIO_, state_);

  result_.lengthInfo = std::move(lengthInfo);
  // Release the chunk engine reference before signalling completion.
  state_.chunkEngineJob.reset();
  batch_.finish(this);
}
|
||||
|
||||
// Creates one AioReadJob per request, pairing readIOs[i] with results[i].
// jobs_ is reserved up front so it is filled without reallocating.
BatchReadJob::BatchReadJob(std::span<const ReadIO> readIOs, std::span<IOResult> results, ChecksumType checksumType)
    : checksumType_(checksumType) {
  const auto count = readIOs.size();
  jobs_.reserve(count);
  for (size_t idx = 0; idx < count; ++idx) {
    jobs_.emplace_back(readIOs[idx], results[idx], *this);
  }
}
|
||||
|
||||
// Adds the data of every successful job to the RDMA transmission `batch`.
// A job whose add() fails has its result replaced by that error.
//
// @return total number of bytes added for the jobs whose add() succeeded.
size_t BatchReadJob::addBufferToBatch(serde::CallContext::RDMATransmission &batch) {
  size_t addedJobs = 0;
  size_t addedBytes = 0;

  for (auto &job : jobs_) {
    if (!job.result().lengthInfo) {
      continue;  // failed jobs carry no data.
    }

    auto length = *job.result().lengthInfo;
    // Skip the head padding in the local buffer.
    auto localbuf = job.state().localbuf.subrange(job.state().headLength, length);
    auto added = batch.add(job.readIO().rdmabuf, localbuf);
    if (UNLIKELY(!added)) {
      rdmaWriteFails.addSample(1);
      job.result().lengthInfo = makeError(std::move(added.error()));
      continue;
    }

    ++addedJobs;
    addedBytes += length;
  }

  rdmaWriteCount.addSample(addedJobs);
  rdmaWriteBytes.addSample(addedBytes);
  return addedBytes;
}
|
||||
|
||||
size_t BatchReadJob::copyToRespBuffer(std::vector<uint8_t> &buffer) {
|
||||
size_t sendBytes = 0;
|
||||
for (auto &job : jobs_) {
|
||||
if (job.result().lengthInfo) {
|
||||
// check chunk version.
|
||||
auto length = *job.result().lengthInfo;
|
||||
auto localbuf = job.state().localbuf.subrange(job.state().headLength, length);
|
||||
size_t bufEnd = buffer.size();
|
||||
|
||||
if (buffer.empty()) buffer.reserve(localbuf.size() * jobs_.size());
|
||||
buffer.resize(buffer.size() + localbuf.size());
|
||||
std::memcpy(&buffer[bufEnd], localbuf.ptr(), localbuf.size());
|
||||
|
||||
sendBytes += length;
|
||||
}
|
||||
}
|
||||
return sendBytes;
|
||||
}
|
||||
|
||||
// Counts one finished job. When the count reaches jobs_.size() the batch latency is recorded and the
// baton awaited by complete() is posted. `job` is unused.
void BatchReadJob::finish(AioReadJob *job) {
  (void)job;
  const auto finished = ++finishedCount_;
  if (finished != jobs_.size()) {
    return;
  }
  batchReadLatency.addSample(RelativeTime::now() - startTime());
  baton_.post();
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
145
src/storage/aio/BatchReadJob.h
Normal file
145
src/storage/aio/BatchReadJob.h
Normal file
@@ -0,0 +1,145 @@
|
||||
#pragma once
|
||||
|
||||
#include <folly/experimental/coro/Baton.h>
|
||||
#include <utility>
|
||||
|
||||
#include "chunk_engine/src/cxx.rs.h"
|
||||
#include "common/net/ib/IBSocket.h"
|
||||
#include "common/serde/CallContext.h"
|
||||
#include "common/utils/Duration.h"
|
||||
#include "fbs/storage/Common.h"
|
||||
#include "storage/store/ChunkMetadata.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
class BatchReadJob;
|
||||
class StorageTarget;
|
||||
|
||||
class ChunkEngineReadJob {
|
||||
public:
|
||||
ChunkEngineReadJob() = default;
|
||||
ChunkEngineReadJob(const ChunkEngineReadJob &) = delete;
|
||||
ChunkEngineReadJob(ChunkEngineReadJob &&other)
|
||||
: engine_(std::exchange(other.engine_, nullptr)),
|
||||
chunk_(std::exchange(other.chunk_, nullptr)) {}
|
||||
|
||||
void set(chunk_engine::Engine *engine, const chunk_engine::Chunk *chunk) {
|
||||
reset();
|
||||
engine_ = engine;
|
||||
chunk_ = chunk;
|
||||
}
|
||||
|
||||
void reset() {
|
||||
if (engine_ && chunk_) {
|
||||
std::exchange(engine_, nullptr)->release_raw_chunk(chunk_);
|
||||
}
|
||||
}
|
||||
|
||||
auto chunk() const { return chunk_; }
|
||||
|
||||
bool has_chunk() const { return chunk_ != nullptr; }
|
||||
|
||||
~ChunkEngineReadJob() { reset(); }
|
||||
|
||||
private:
|
||||
chunk_engine::Engine *engine_{};
|
||||
const chunk_engine::Chunk *chunk_{};
|
||||
};
|
||||
|
||||
class AioReadJob {
|
||||
public:
|
||||
AioReadJob(const ReadIO &readIO, IOResult &result, BatchReadJob &batch);
|
||||
|
||||
auto &readIO() { return readIO_; }
|
||||
auto &result() { return result_; }
|
||||
auto &batch() { return batch_; }
|
||||
auto &state() { return state_; }
|
||||
|
||||
void setResult(Result<uint32_t> lengthInfo);
|
||||
|
||||
uint32_t alignedOffset() const { return readIO_.offset - state_.headLength; }
|
||||
uint32_t alignedLength() const { return readIO_.length + state_.headLength + state_.tailLength; }
|
||||
|
||||
auto startTime() const { return startTime_; }
|
||||
void resetStartTime() { startTime_ = RelativeTime::now(); }
|
||||
|
||||
private:
|
||||
const ReadIO &readIO_;
|
||||
IOResult &result_;
|
||||
BatchReadJob &batch_;
|
||||
struct State {
|
||||
net::RDMABuf localbuf{};
|
||||
StorageTarget *storageTarget = nullptr;
|
||||
ChunkEngineReadJob chunkEngineJob{};
|
||||
SERDE_STRUCT_FIELD(headLength, uint32_t{});
|
||||
SERDE_STRUCT_FIELD(tailLength, uint32_t{});
|
||||
SERDE_STRUCT_FIELD(readLength, uint32_t{}); // after cropping.
|
||||
SERDE_STRUCT_FIELD(readFd, int32_t{});
|
||||
SERDE_STRUCT_FIELD(readOffset, uint64_t{});
|
||||
SERDE_STRUCT_FIELD(chunkLen, uint32_t{});
|
||||
SERDE_STRUCT_FIELD(bufferIndex, uint32_t{});
|
||||
SERDE_STRUCT_FIELD(fdIndex, std::optional<uint32_t>{});
|
||||
SERDE_STRUCT_FIELD(chunkChecksum, ChecksumInfo{});
|
||||
SERDE_STRUCT_FIELD(readUncommitted, false);
|
||||
} state_;
|
||||
static_assert(serde::Serializable<State>);
|
||||
RelativeTime startTime_{};
|
||||
};
|
||||
|
||||
class BatchReadJob {
|
||||
public:
|
||||
BatchReadJob(std::span<const ReadIO> readIOs, std::span<IOResult> results, ChecksumType checksumType);
|
||||
BatchReadJob(const ReadIO &readIO, StorageTarget *target, IOResult &result, ChecksumType checksumType)
|
||||
: BatchReadJob(std::span(&readIO, 1), std::span(&result, 1), checksumType) {
|
||||
jobs_.back().state().storageTarget = target;
|
||||
}
|
||||
CoTask<void> complete() { co_await baton_; }
|
||||
size_t addBufferToBatch(serde::CallContext::RDMATransmission &batch);
|
||||
size_t copyToRespBuffer(std::vector<uint8_t> &buffer);
|
||||
void finish(AioReadJob *job);
|
||||
auto checksumType() const { return checksumType_; }
|
||||
bool recalculateChecksum() const { return recalculateChecksum_; }
|
||||
void setRecalculateChecksum(bool value = true) { recalculateChecksum_ = value; }
|
||||
auto &front() { return jobs_.front(); }
|
||||
auto &front() const { return jobs_.front(); }
|
||||
auto startTime() const { return startTime_.load(); }
|
||||
void resetStartTime() { startTime_ = RelativeTime::now(); }
|
||||
|
||||
private:
|
||||
friend class AioReadJobIterator;
|
||||
std::vector<AioReadJob> jobs_;
|
||||
folly::coro::Baton baton_;
|
||||
std::atomic<uint64_t> finishedCount_{};
|
||||
std::atomic<RelativeTime> startTime_ = RelativeTime::now();
|
||||
const ChecksumType checksumType_;
|
||||
bool recalculateChecksum_ = false;
|
||||
};
|
||||
|
||||
class AioReadJobIterator {
|
||||
public:
|
||||
AioReadJobIterator() = default;
|
||||
AioReadJobIterator(BatchReadJob *batch)
|
||||
: batch_(batch),
|
||||
end_(batch->jobs_.size()) {}
|
||||
AioReadJobIterator(BatchReadJob *batch, uint32_t start, uint32_t size)
|
||||
: batch_(batch),
|
||||
begin_(start),
|
||||
end_(std::min((uint32_t)batch->jobs_.size(), start + size)) {}
|
||||
|
||||
operator bool() const { return begin_ < end_; }
|
||||
bool isNull() const { return batch_ == nullptr; }
|
||||
AioReadJob &operator*() { return batch_->jobs_[begin_]; }
|
||||
AioReadJob *operator->() { return &batch_->jobs_[begin_]; }
|
||||
AioReadJob *operator++(int) { return &batch_->jobs_[begin_++]; }
|
||||
|
||||
auto startTime() const { return startTime_; }
|
||||
auto resetStartTime() { startTime_ = RelativeTime::now(); }
|
||||
|
||||
private:
|
||||
BatchReadJob *batch_ = nullptr;
|
||||
uint32_t begin_ = 0;
|
||||
uint32_t end_ = 0;
|
||||
RelativeTime startTime_ = RelativeTime::now();
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
2
src/storage/chunk_engine/.gitignore
vendored
Normal file
2
src/storage/chunk_engine/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
/target
|
||||
/lcov.info
|
||||
40
src/storage/chunk_engine/Cargo.toml
Normal file
40
src/storage/chunk_engine/Cargo.toml
Normal file
@@ -0,0 +1,40 @@
|
||||
[package]
|
||||
name = "chunk_engine"
|
||||
version = "0.1.11"
|
||||
edition = "2021"
|
||||
|
||||
[lib]
|
||||
crate-type = ["lib", "staticlib"]
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1"
|
||||
byteorder = "1"
|
||||
crc32c = "0"
|
||||
cxx = "1"
|
||||
dashmap = "6"
|
||||
derse = { version = ">=0.1.32", features = ["tinyvec"] }
|
||||
lazy_static = "1"
|
||||
libc = "0"
|
||||
lockmap = "0.1.6"
|
||||
rand = "0"
|
||||
rocksdb = "0"
|
||||
rolling-file = "0"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
static_assertions = "1"
|
||||
tinyvec = { version = "1", features = ["alloc"] }
|
||||
toml = "0"
|
||||
tracing = "0"
|
||||
tracing-appender = "0"
|
||||
tracing-subscriber = { version = "0", features = ["fmt"] }
|
||||
|
||||
[dev-dependencies]
|
||||
clap = { version = "4", features = ["derive"] }
|
||||
tempfile = "3"
|
||||
criterion = "0"
|
||||
|
||||
[build-dependencies]
|
||||
cxx-build = "1"
|
||||
|
||||
[[bench]]
|
||||
name = "bench_allocator"
|
||||
harness = false
|
||||
62
src/storage/chunk_engine/README.md
Normal file
62
src/storage/chunk_engine/README.md
Normal file
@@ -0,0 +1,62 @@
|
||||
# chunk-engine
|
||||
|
||||
### Design
|
||||
|
||||
1. The entire Chunk Engine can be divided into two components:
|
||||
1. **Allocator**: Responsible for allocating/reclaiming chunks and modifying memory states.
|
||||
2. **MetaStore**: Responsible for persisting allocation/reclamation events.
|
||||
2. Workflow for writing a new chunk:
|
||||
1. The **Allocator** assigns a new chunk position, pointing to a disk space (purely in-memory operation).
|
||||
2. Write data to this chunk position. If a power failure or write failure occurs at this stage, no existing data is affected.
|
||||
3. Generate corresponding chunk metadata and persist it alongside the allocation event to the **MetaStore**. Using RocksDB's WriteBatch ensures **atomic** updates—the entire write operation either succeeds or fails, with no intermediate states.
|
||||
3. Maintaining the Allocator's in-memory state:
|
||||
1. At startup, the Allocator **quickly** loads all allocation information from RocksDB.
|
||||
2. Allocation is performed in-memory first, followed by persistence. If a failure occurs before persistence, the allocation event is lost.
|
||||
3. Reclamation first persists the event to disk, then modifies the memory state. Even if a chunk deletion event is persisted, the chunk remains readable as long as memory holds its reference.
|
||||
4. This ensures conflict-free read/write operations: a read operation acquires a chunk reference, guaranteeing the chunk's validity until the read completes.
|
||||
4. Use `Arc` to manage ownership of chunk position:
|
||||
1. Allocation returns an `Arc<ChunkPos>`. If persistence fails, the position is automatically released when the `Arc` is dropped.
|
||||
2. Read operations also return an `Arc<ChunkPos>`, ensuring safe data access even during concurrent writes or deletions.
|
||||
|
||||
### Allocator
|
||||
|
||||
Storage hierarchy:
|
||||
|
||||
1. **Chunk**: Basic data unit, currently proposed as 64KB, 512KB, and 4MB.
|
||||
2. **Group**: Each group contains 256 chunks (16MB, 128MB, or 1GB depending on chunk size).
|
||||
3. **File**: For 512KB chunks, a single file (~120GB) contains ~960 groups.
|
||||
4. **Disk**: Single disk capacity of 30TB, divided into 256 files per chunk size.
|
||||
5. **Node**: A single node contains 10–20 disks.
|
||||
|
||||
With 512KB chunks, this configuration supports up to ~1.2 billion chunks (20 disks × 30TB ÷ 512KB) and ~5 million groups (chunks ÷ 256) per machine.
|
||||
|
||||
Implementation details:
|
||||
1. Each group uses a 256-bit bitset (4 `uint64_t`) to track allocation status.
|
||||
2. Maintain three in-memory structures:
|
||||
- `allocated_groups`: Groups with allocated space but no chunks assigned.
|
||||
- `unallocated_groups`: Groups without allocated space.
|
||||
- `active_groups`: Map of `<group_id, group_state>` tracking allocation status.
|
||||
3. Chunk allocation workflow:
|
||||
1. Prioritize finding free slots in `active_groups` using **`__builtin_ctz`** for fast bitwise operations.
|
||||
2. If `active_groups` is empty, acquire a new group from `allocated_groups`.
|
||||
3. If `allocated_groups` is empty, fetch a group from `unallocated_groups` and allocate disk space synchronously.
|
||||
4. Background threads:
|
||||
- **`allocate_thread`**: Maintains `active_groups` within a target size range to ensure in-memory allocation efficiency.
|
||||
- **`compact_thread`**: Periodically scans `active_groups`, migrates all chunks from selected groups, releases space, and returns groups to `allocated_groups`.
|
||||
|
||||
### MetaStore
|
||||
|
||||
Persists three mappings:
|
||||
1. **`chunk_id -> chunk_meta`**: Metadata includes chunk location, length, hash, version, etc., serialized using **`derse`**.
|
||||
2. **`group_id -> group_state`**: Tracks chunk allocation status within groups, leveraging RocksDB's **MergeOp** for atomic updates.
|
||||
3. **`chunk_pos -> chunk_id`**: Maps physical positions to chunk IDs, used by `compact_thread` during chunk migration.
|
||||
|
||||
### Chunk Engine
|
||||
|
||||
1. **MetaCache**: Maintains an in-memory `chunk_id -> chunk_info` mapping, where `chunk_info` includes `chunk_meta` and `Arc<ChunkPos>`.
|
||||
2. **Read operation**: Returns `chunk_info`. The `Arc<ChunkPos>` ensures safe data access until the read completes.
|
||||
3. **Write operation workflow**:
|
||||
1. Query `MetaCache` to retrieve the current `chunk_info`.
|
||||
2. Invoke `Allocator::allocate()` to obtain a new chunk position.
|
||||
3. Read the existing chunk data, apply the new write request on top of it, write the result to the new chunk position, and generate `new_chunk_info`.
|
||||
4. Persist `new_chunk_info` to the **MetaStore** along with a release record for the original chunk position.
|
||||
42
src/storage/chunk_engine/benches/bench_allocator.rs
Normal file
42
src/storage/chunk_engine/benches/bench_allocator.rs
Normal file
@@ -0,0 +1,42 @@
|
||||
use chunk_engine::*;
|
||||
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Allocates `n` chunks one after another, dropping each chunk right after
/// it has been obtained.
fn allocate(allocator: &Arc<Allocator>, n: usize) {
    (0..n).for_each(|_| {
        let chunk = allocator.allocate(true).unwrap();
        drop(chunk);
    });
}
|
||||
|
||||
fn criterion_benchmark(c: &mut Criterion) {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
|
||||
let cluster_config = ClustersConfig {
|
||||
path: dir.path().into(),
|
||||
chunk_size: CHUNK_SIZE_NORMAL,
|
||||
create: true,
|
||||
};
|
||||
let clusters = Clusters::open(&cluster_config).unwrap();
|
||||
|
||||
let meta_store_config = MetaStoreConfig {
|
||||
rocksdb: RocksDBConfig {
|
||||
path: dir.path().join("meta"),
|
||||
create: true,
|
||||
..Default::default()
|
||||
},
|
||||
..Default::default()
|
||||
};
|
||||
let meta_store = std::sync::Arc::new(MetaStore::open(&meta_store_config).unwrap());
|
||||
|
||||
let allocator = chunk_engine::Allocator::load(clusters, meta_store.iterator()).unwrap();
|
||||
allocator.do_allocate_task(1, 1, &meta_store).unwrap();
|
||||
|
||||
let count: usize = 1 << 16;
|
||||
|
||||
c.bench_with_input(BenchmarkId::new("allocate", count), &count, |b, &c| {
|
||||
b.iter(|| allocate(&allocator, c))
|
||||
});
|
||||
}
|
||||
|
||||
criterion_group!(benches, criterion_benchmark);
|
||||
criterion_main!(benches);
|
||||
4
src/storage/chunk_engine/build.rs
Normal file
4
src/storage/chunk_engine/build.rs
Normal file
@@ -0,0 +1,4 @@
|
||||
fn main() {
|
||||
let _ = cxx_build::bridge("src/cxx.rs");
|
||||
println!("cargo:rerun-if-changed=src/cxx.rs");
|
||||
}
|
||||
182
src/storage/chunk_engine/docs/architecture.drawio.svg
Normal file
182
src/storage/chunk_engine/docs/architecture.drawio.svg
Normal file
File diff suppressed because one or more lines are too long
|
After Width: | Height: | Size: 77 KiB |
94
src/storage/chunk_engine/examples/chunk_viewer.rs
Normal file
94
src/storage/chunk_engine/examples/chunk_viewer.rs
Normal file
@@ -0,0 +1,94 @@
|
||||
use chunk_engine::*;
|
||||
use clap::Parser;
|
||||
use derse::Deserialize;
|
||||
use std::{
|
||||
collections::{BTreeMap, HashMap},
|
||||
path::PathBuf,
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
/// A distributed copy/move tool.
|
||||
#[derive(Parser, Debug, Clone)]
|
||||
#[command(version, about, long_about = None)]
|
||||
pub struct Args {
|
||||
/// Path to rocksdb.
|
||||
pub path: PathBuf,
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
let meta_config = MetaStoreConfig {
|
||||
rocksdb: RocksDBConfig {
|
||||
path: args.path,
|
||||
create: false,
|
||||
read_only: true,
|
||||
},
|
||||
prefix_len: 4,
|
||||
};
|
||||
let meta_store = MetaStore::open(&meta_config)?;
|
||||
|
||||
let mut chunk_allocators = HashMap::new();
|
||||
let mut used_map = BTreeMap::new();
|
||||
let mut reversed_map = BTreeMap::new();
|
||||
let mut group_count = BTreeMap::new();
|
||||
let mut chunk_size = CHUNK_SIZE_SMALL;
|
||||
let mut real_map = BTreeMap::new();
|
||||
loop {
|
||||
let counter = Arc::new(AllocatorCounter::new(chunk_size));
|
||||
let it = meta_store.iterator();
|
||||
let chunk_allocator = ChunkAllocator::load(it, counter.clone(), chunk_size)?;
|
||||
let allocated_chunks = counter.allocated_chunks();
|
||||
let reserved_chunks = counter.reserved_chunks();
|
||||
used_map.insert(chunk_size, allocated_chunks - reserved_chunks);
|
||||
reversed_map.insert(chunk_size, reserved_chunks);
|
||||
group_count.insert(
|
||||
chunk_size,
|
||||
(
|
||||
chunk_allocator.full_groups.len(),
|
||||
chunk_allocator.active_groups.len(),
|
||||
),
|
||||
);
|
||||
real_map.insert(chunk_size, 0u64);
|
||||
chunk_allocators.insert(chunk_size, chunk_allocator);
|
||||
|
||||
if chunk_size >= CHUNK_SIZE_ULTRA {
|
||||
break;
|
||||
}
|
||||
chunk_size *= 2;
|
||||
}
|
||||
|
||||
let mut it = meta_store.iterator();
|
||||
let end_key = MetaKey::chunk_meta_key_prefix();
|
||||
it.seek(&end_key)?;
|
||||
|
||||
if it.key() == Some(end_key.as_ref()) {
|
||||
it.next(); // [begin, end)
|
||||
}
|
||||
|
||||
loop {
|
||||
if !it.valid() {
|
||||
break;
|
||||
}
|
||||
|
||||
if it.key().unwrap()[0] != MetaKey::CHUNK_META_KEY_PREFIX {
|
||||
break;
|
||||
}
|
||||
|
||||
let chunk_meta =
|
||||
ChunkMeta::deserialize(it.value().unwrap()).map_err(Error::SerializationError)?;
|
||||
|
||||
let chunk_size = chunk_meta.pos.chunk_size();
|
||||
let allocator = chunk_allocators.get_mut(&chunk_size).unwrap();
|
||||
allocator.reference(chunk_meta.pos, true);
|
||||
real_map.entry(chunk_size).and_modify(|v| *v += 1);
|
||||
|
||||
it.next();
|
||||
}
|
||||
println!("{:#?}", used_map);
|
||||
println!("{:#?}", reversed_map);
|
||||
println!("{:#?}", group_count);
|
||||
assert_eq!(used_map, real_map);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
258
src/storage/chunk_engine/src/alloc/allocator.rs
Normal file
258
src/storage/chunk_engine/src/alloc/allocator.rs
Normal file
@@ -0,0 +1,258 @@
|
||||
use super::super::*;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
pub struct Allocator {
|
||||
allocator: Mutex<ChunkAllocator>,
|
||||
pub counter: Arc<AllocatorCounter>,
|
||||
pub clusters: Clusters,
|
||||
}
|
||||
|
||||
impl Allocator {
|
||||
pub fn load(clusters: Clusters, it: RocksDBIterator) -> Result<Arc<Allocator>> {
|
||||
let counter = Arc::new(AllocatorCounter::new(clusters.chunk_size));
|
||||
Ok(Arc::new(Self {
|
||||
allocator: Mutex::new(ChunkAllocator::load(
|
||||
it,
|
||||
counter.clone(),
|
||||
clusters.chunk_size,
|
||||
)?),
|
||||
counter,
|
||||
clusters,
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn allocate(self: &Arc<Self>, allow_to_allocate: bool) -> Result<Chunk> {
|
||||
let this = self.as_ref();
|
||||
let mut allocator = this.allocator.lock().unwrap();
|
||||
allocator
|
||||
.allocate(&this.clusters, allow_to_allocate)
|
||||
.map(|pos| {
|
||||
Chunk::new(
|
||||
ChunkMeta {
|
||||
pos,
|
||||
..Default::default()
|
||||
},
|
||||
self.clone(),
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn reference(self: &Arc<Self>, meta: ChunkMeta, first_ref: bool) -> Chunk {
|
||||
let mut allocator = self.allocator.lock().unwrap();
|
||||
allocator.reference(meta.pos, first_ref);
|
||||
Chunk::new(meta, self.clone())
|
||||
}
|
||||
|
||||
pub fn dereference(&self, pos: Position) {
|
||||
let mut allocator = self.allocator.lock().unwrap();
|
||||
allocator.dereference(pos)
|
||||
}
|
||||
|
||||
pub fn get_allocate_task(&self, min_remain: usize, max_remain: usize) -> AllocateTask {
|
||||
let mut allocator = self.allocator.lock().unwrap();
|
||||
allocator
|
||||
.group_allocator
|
||||
.get_allocate_task(min_remain, max_remain)
|
||||
}
|
||||
|
||||
pub fn finish_allocate_task(&self, task: AllocateTask, succ: bool) {
|
||||
let mut allocator = self.allocator.lock().unwrap();
|
||||
allocator.group_allocator.finish_allocate_task(task, succ);
|
||||
}
|
||||
|
||||
pub fn do_allocate_task(
|
||||
&self,
|
||||
min_remain: usize,
|
||||
max_remain: usize,
|
||||
meta_store: &MetaStore,
|
||||
) -> Result<AllocateTask> {
|
||||
let task = self.get_allocate_task(min_remain, max_remain);
|
||||
|
||||
let result = match task {
|
||||
AllocateTask::None => return Ok(task),
|
||||
AllocateTask::Allocate(group_id) => (|| {
|
||||
self.clusters.allocate(group_id)?;
|
||||
meta_store.allocate_group(group_id)
|
||||
})(),
|
||||
AllocateTask::Deallocate(group_id) => (|| {
|
||||
tracing::warn!("deallocate group: {:?}", group_id);
|
||||
meta_store.remove_group(group_id)?;
|
||||
self.clusters.deallocate(group_id)
|
||||
})(),
|
||||
};
|
||||
|
||||
self.finish_allocate_task(task, result.is_ok());
|
||||
|
||||
result?;
|
||||
Ok(task)
|
||||
}
|
||||
|
||||
pub fn get_compact_task(&self, max_reserved: u64) -> Option<GroupId> {
|
||||
let mut allocator = self.allocator.lock().unwrap();
|
||||
allocator.get_compact_task(max_reserved)
|
||||
}
|
||||
|
||||
pub fn finish_compact_task(&self, group_id: GroupId) {
|
||||
let mut allocator = self.allocator.lock().unwrap();
|
||||
allocator.finish_compact_task(group_id)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for Allocator {
|
||||
fn drop(&mut self) {
|
||||
tracing::info!("Allocator {:?} is dropping...", self.clusters);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Opens clusters and a meta store under `root` and loads an allocator on
    /// top of them. The meta store is returned first so that callers binding
    /// the tuple drop the allocator before the meta store.
    fn open_allocator(root: &std::path::Path, chunk_size: Size) -> (Arc<MetaStore>, Arc<Allocator>) {
        let cluster_config = ClustersConfig {
            path: root.into(),
            chunk_size,
            create: true,
        };
        let clusters = Clusters::open(&cluster_config).unwrap();

        let meta_store_config = MetaStoreConfig {
            rocksdb: RocksDBConfig {
                path: root.join("meta"),
                create: true,
                ..Default::default()
            },
            ..Default::default()
        };
        let meta_store = Arc::new(MetaStore::open(&meta_store_config).unwrap());

        let allocator = Allocator::load(clusters, meta_store.iterator()).unwrap();
        (meta_store, allocator)
    }

    /// Allocates chunks, writes and reads them from several threads, and
    /// checks the group bookkeeping before and after the chunks are dropped.
    #[test]
    fn test_allocator() {
        use rand::seq::SliceRandom;
        let dir = tempfile::tempdir().unwrap();
        let (_meta_store, allocator) = open_allocator(dir.path(), CHUNK_SIZE_NORMAL);

        // A chunk dropped right away frees its slot, so the same position comes back each time.
        let first_pos = Position::new(GroupId::default(), 0);
        for _ in 0..10000 {
            let chunk = allocator.allocate(true).unwrap();
            assert_eq!(chunk.meta().pos, first_pos);
        }

        const N: usize = 1000;
        let mut chunks: Vec<_> = (0..N)
            .map(|_| Arc::new(allocator.allocate(true).unwrap()))
            .collect();

        {
            let state = allocator.allocator.lock().unwrap();
            assert_eq!(state.full_groups.len(), N / 256);
            assert_eq!(state.active_groups.len(), 1);
            let (_, group) = state.active_groups.iter().next().unwrap();
            assert_eq!(group.count() as usize, N % 256);
        }

        const T: usize = 8;

        // Each writer thread fills the chunks whose index maps to it with that index byte.
        let writers: Vec<_> = (0..T)
            .map(|worker| {
                let chunks = chunks.clone();
                std::thread::spawn(move || {
                    let mut block = create_aligned_vec(ALIGN_SIZE);
                    block.fill(0);
                    for chunk in chunks.iter() {
                        let index = chunk.meta().pos.index();
                        if index as usize % T == worker {
                            block.fill(index);
                            chunk.pwrite(&block[..], 0).unwrap();
                        }
                    }
                })
            })
            .collect();
        for handle in writers {
            handle.join().unwrap();
        }

        chunks.shuffle(&mut rand::thread_rng());

        // Each reader thread verifies the first bytes of the chunks that map to it.
        let readers: Vec<_> = (0..T)
            .map(|worker| {
                let chunks = chunks.clone();
                std::thread::spawn(move || {
                    let mut buf = [0u8; 8];
                    for chunk in chunks.iter() {
                        let index = chunk.meta().pos.index();
                        if index as usize % T == worker {
                            assert!(chunk.pread(&mut buf, 0).is_ok());
                            assert_eq!(buf, [index; 8]);
                        }
                    }
                })
            })
            .collect();
        for handle in readers {
            handle.join().unwrap();
        }

        chunks.clear();

        {
            let state = allocator.allocator.lock().unwrap();
            assert!(state.full_groups.is_empty());
            assert!(state.active_groups.is_empty());
        }
    }

    /// Drives `do_allocate_task` until it reports `None`, first with a 4..8
    /// window and then with a 1..2 window, checking the counters each time.
    #[test]
    fn test_allocator_do_allocate_task() {
        let dir = tempfile::tempdir().unwrap();
        const S: Size = CHUNK_SIZE_NORMAL;
        let (meta_store, allocator) = open_allocator(dir.path(), S);

        let group_size = S * GroupState::TOTAL_BITS as u64;

        for _ in 0..4 {
            let task = allocator.do_allocate_task(4, 8, &meta_store).unwrap();
            assert!(matches!(task, AllocateTask::Allocate(_)));
        }
        let task = allocator.do_allocate_task(4, 8, &meta_store).unwrap();
        assert!(matches!(task, AllocateTask::None));

        let s = allocator.counter.used_size();
        assert_eq!(s.allocated_size, group_size * 4);
        assert_eq!(s.reserved_size, group_size * 4);

        for _ in 0..2 {
            let task = allocator.do_allocate_task(1, 2, &meta_store).unwrap();
            assert!(matches!(task, AllocateTask::Deallocate(_)));
        }
        let task = allocator.do_allocate_task(1, 2, &meta_store).unwrap();
        assert!(matches!(task, AllocateTask::None));

        let s = allocator.counter.used_size();
        assert_eq!(s.allocated_size, group_size * 2);
        assert_eq!(s.reserved_size, group_size * 2);
    }
}
|
||||
88
src/storage/chunk_engine/src/alloc/allocator_counter.rs
Normal file
88
src/storage/chunk_engine/src/alloc/allocator_counter.rs
Normal file
@@ -0,0 +1,88 @@
|
||||
use super::super::*;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct AllocatorCounter {
|
||||
pub chunk_size: Size,
|
||||
pub allocated_chunks: AtomicU64,
|
||||
pub reserved_chunks: AtomicU64,
|
||||
pub position_count: AtomicU64,
|
||||
pub position_rc: AtomicU64,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Copy, PartialEq, Eq, Debug)]
|
||||
#[repr(C)]
|
||||
pub struct UsedSize {
|
||||
pub allocated_size: Size,
|
||||
pub reserved_size: Size,
|
||||
pub position_count: u64,
|
||||
pub position_rc: u64,
|
||||
}
|
||||
|
||||
impl std::iter::Sum for UsedSize {
|
||||
fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
|
||||
let mut s = UsedSize::default();
|
||||
for i in iter {
|
||||
s.allocated_size += i.allocated_size;
|
||||
s.reserved_size += i.reserved_size;
|
||||
s.position_count += i.position_count;
|
||||
s.position_rc += i.position_rc;
|
||||
}
|
||||
s
|
||||
}
|
||||
}
|
||||
|
||||
impl AllocatorCounter {
|
||||
pub fn new(chunk_size: Size) -> Self {
|
||||
Self {
|
||||
chunk_size,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn allocated_chunks(&self) -> u64 {
|
||||
self.allocated_chunks.load(Ordering::Acquire)
|
||||
}
|
||||
|
||||
pub fn reserved_chunks(&self) -> u64 {
|
||||
self.reserved_chunks.load(Ordering::Acquire)
|
||||
}
|
||||
|
||||
pub fn used_size(&self) -> UsedSize {
|
||||
UsedSize {
|
||||
allocated_size: self.allocated_chunks() * self.chunk_size,
|
||||
reserved_size: self.reserved_chunks() * self.chunk_size,
|
||||
position_count: self.position_count.load(Ordering::Acquire),
|
||||
position_rc: self.position_rc.load(Ordering::Acquire),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn init(&self, allocated_count: u64, reserved_count: u64) {
|
||||
self.allocated_chunks
|
||||
.store(allocated_count, Ordering::Release);
|
||||
self.reserved_chunks
|
||||
.store(reserved_count, Ordering::Release);
|
||||
}
|
||||
|
||||
pub fn allocate_group(&self) {
|
||||
self.allocated_chunks
|
||||
.fetch_add(GroupState::TOTAL_BITS as u64, Ordering::SeqCst);
|
||||
self.reserved_chunks
|
||||
.fetch_add(GroupState::TOTAL_BITS as u64, Ordering::SeqCst);
|
||||
}
|
||||
|
||||
pub fn deallocate_group(&self) {
|
||||
self.allocated_chunks
|
||||
.fetch_sub(GroupState::TOTAL_BITS as u64, Ordering::SeqCst);
|
||||
self.reserved_chunks
|
||||
.fetch_sub(GroupState::TOTAL_BITS as u64, Ordering::SeqCst);
|
||||
}
|
||||
|
||||
pub fn allocate_chunk(&self) {
|
||||
self.reserved_chunks.fetch_sub(1, Ordering::SeqCst);
|
||||
}
|
||||
|
||||
pub fn deallocate_chunk(&self) {
|
||||
self.reserved_chunks.fetch_add(1, Ordering::SeqCst);
|
||||
}
|
||||
}
|
||||
200
src/storage/chunk_engine/src/alloc/allocators.rs
Normal file
200
src/storage/chunk_engine/src/alloc/allocators.rs
Normal file
@@ -0,0 +1,200 @@
|
||||
use super::super::*;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct Allocators {
|
||||
pub vec: [Arc<Allocator>; CHUNK_SIZE_NUMBER],
|
||||
meta_store: Arc<MetaStore>,
|
||||
}
|
||||
|
||||
impl Allocators {
|
||||
pub fn new(path: &Path, create: bool, meta_store: Arc<MetaStore>) -> Result<Self> {
|
||||
let mut allocators = vec![];
|
||||
for i in 0..CHUNK_SIZE_NUMBER {
|
||||
let chunk_size = CHUNK_SIZE_SMALL * (1 << i);
|
||||
let allocator = Self::create(path, create, &meta_store, chunk_size)?;
|
||||
allocators.push(allocator);
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
vec: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10].map(|i| allocators[i].clone()),
|
||||
meta_store,
|
||||
})
|
||||
}
|
||||
|
||||
fn create(
|
||||
path: &Path,
|
||||
create: bool,
|
||||
meta_store: &Arc<MetaStore>,
|
||||
chunk_size: Size,
|
||||
) -> Result<Arc<Allocator>> {
|
||||
let cluster_config = ClustersConfig {
|
||||
path: path.join(chunk_size.to_string()),
|
||||
chunk_size,
|
||||
create,
|
||||
};
|
||||
let clusters = Clusters::open(&cluster_config)?;
|
||||
let allocator = Allocator::load(clusters, meta_store.iterator())?;
|
||||
tracing::info!("Allocator {:?} is created...", allocator.clusters);
|
||||
Result::Ok(allocator)
|
||||
}
|
||||
|
||||
pub fn select_by_pos(&self, pos: Position) -> Result<&Arc<Allocator>> {
|
||||
let chunk_size = pos.chunk_size();
|
||||
if chunk_size.is_power_of_two()
|
||||
&& CHUNK_SIZE_SMALL <= chunk_size
|
||||
&& chunk_size <= CHUNK_SIZE_ULTRA
|
||||
{
|
||||
Ok(&self.vec[chunk_size.trailing_zeros() as usize - CHUNK_SIZE_SHIFT])
|
||||
} else {
|
||||
Err(Error::InvalidArg(format!(
|
||||
"select allocator invalid pos: {pos:?}"
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn select_by_size(&self, size: Size) -> Result<&Arc<Allocator>> {
|
||||
if size <= CHUNK_SIZE_SMALL {
|
||||
Ok(&self.vec[0])
|
||||
} else if size <= CHUNK_SIZE_ULTRA {
|
||||
Ok(&self.vec[size.next_power_of_two().trailing_zeros() as usize - CHUNK_SIZE_SHIFT])
|
||||
} else {
|
||||
Err(Error::InvalidArg(format!(
|
||||
"select allocator invalid size: {size:?}"
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn allocate(&self, size: Size, allow_to_allocate: bool) -> Result<Chunk> {
|
||||
let allocator = self.select_by_size(size)?;
|
||||
allocator.allocate(allow_to_allocate)
|
||||
}
|
||||
|
||||
pub fn allocate_groups(
|
||||
&self,
|
||||
min_remain: usize,
|
||||
max_remain: usize,
|
||||
batch_size: usize,
|
||||
allocate_ultra_groups: bool,
|
||||
) -> usize {
|
||||
let mut finish = 0usize;
|
||||
for allocator in &self.vec {
|
||||
let is_ultra = allocator.clusters.chunk_size > CHUNK_SIZE_LARGE;
|
||||
if is_ultra != allocate_ultra_groups {
|
||||
continue;
|
||||
}
|
||||
for _ in 0..batch_size {
|
||||
match allocator.do_allocate_task(min_remain, max_remain, &self.meta_store) {
|
||||
Ok(AllocateTask::None) => break,
|
||||
Ok(_) => {
|
||||
finish += 1;
|
||||
continue;
|
||||
}
|
||||
Err(_) => break,
|
||||
}
|
||||
}
|
||||
}
|
||||
finish
|
||||
}
|
||||
|
||||
pub fn used_size(&self) -> UsedSize {
|
||||
self.vec
|
||||
.iter()
|
||||
.map(|allocator| allocator.counter.used_size())
|
||||
.sum()
|
||||
}
|
||||
|
||||
pub fn get_allocate_tasks(&self, max_reserved: u64) -> tinyvec::ArrayVec<[GroupId; 3]> {
|
||||
self.vec
|
||||
.iter()
|
||||
.filter_map(|allocator| allocator.get_compact_task(max_reserved))
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn finish_compact_task(&self, group_id: GroupId) {
|
||||
self.select_by_pos(Position::new(group_id, 0))
|
||||
.unwrap()
|
||||
.finish_compact_task(group_id);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks allocator selection by position and by size, the initial usage
    /// counters, and rejection of chunk sizes outside the supported range.
    #[test]
    fn test_allocators() {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();

        let rocksdb = RocksDBConfig {
            path: root.join("meta"),
            create: true,
            ..Default::default()
        };
        let meta_config = MetaStoreConfig {
            rocksdb,
            ..Default::default()
        };

        let meta_store = Arc::new(MetaStore::open(&meta_config).unwrap());
        let allocators = Allocators::new(root, true, meta_store).unwrap();

        // Position of slot 0 in group (0, 0) of the given chunk size.
        let pos_of = |chunk_size: Size| Position::new(GroupId::new(chunk_size, 0, 0), 0);
        // Chunk size of the allocator selected for a payload of `size` bytes.
        let selected_for = |size: Size| allocators.select_by_size(size).unwrap().clusters.chunk_size;

        let by_pos = allocators.select_by_pos(pos_of(CHUNK_SIZE_NORMAL)).unwrap();
        assert_eq!(by_pos.clusters.chunk_size, CHUNK_SIZE_NORMAL);

        assert_eq!(selected_for(CHUNK_SIZE_SMALL), CHUNK_SIZE_SMALL);
        assert_eq!(selected_for(CHUNK_SIZE_SMALL + 1), CHUNK_SIZE_SMALL * 2,);
        assert_eq!(selected_for(CHUNK_SIZE_NORMAL), CHUNK_SIZE_NORMAL);
        assert_eq!(selected_for(CHUNK_SIZE_NORMAL + 1), CHUNK_SIZE_NORMAL * 2,);

        let used_size = allocators.used_size();
        assert_eq!(used_size.allocated_size, 0);
        assert_eq!(used_size.reserved_size, 0);

        assert!(allocators.select_by_pos(pos_of(CHUNK_SIZE_ULTRA)).is_ok());
        assert!(allocators.select_by_pos(pos_of(Size::gibibyte(1))).is_err());
        assert!(allocators.select_by_size(Size::gibibyte(1)).is_err());
    }
}
|
||||
312
src/storage/chunk_engine/src/alloc/chunk.rs
Normal file
312
src/storage/chunk_engine/src/alloc/chunk.rs
Normal file
@@ -0,0 +1,312 @@
|
||||
use super::super::*;
|
||||
use lazy_static::lazy_static;
|
||||
use rand::Rng;
|
||||
use std::cell::RefCell;
|
||||
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub struct Chunk {
|
||||
meta: ChunkMeta,
|
||||
allocator: Arc<Allocator>,
|
||||
}
|
||||
|
||||
/// Reference-counted, shareable handle to a `Chunk`.
pub type ChunkArc = Arc<Chunk>;
|
||||
|
||||
lazy_static! {
    /// Zero-filled buffer of `CHUNK_SIZE_ULTRA` bytes obtained from
    /// `create_aligned_vec`; `safe_write` slices it for zero padding.
    static ref ZERO: Vec<u8> = {
        let mut zeros = create_aligned_vec(CHUNK_SIZE_ULTRA);
        zeros.fill(0);
        zeros
    };
}
|
||||
|
||||
impl Chunk {
|
||||
thread_local! {
|
||||
static BUFFER: RefCell<Vec<u8>> = RefCell::new(create_aligned_vec(CHUNK_SIZE_ULTRA));
|
||||
}
|
||||
|
||||
pub fn new(meta: ChunkMeta, allocator: Arc<Allocator>) -> Self {
|
||||
Self { meta, allocator }
|
||||
}
|
||||
|
||||
pub fn meta(&self) -> &ChunkMeta {
|
||||
&self.meta
|
||||
}
|
||||
|
||||
pub fn capacity(&self) -> u32 {
|
||||
self.meta.pos.chunk_size().into()
|
||||
}
|
||||
|
||||
pub fn update_meta(&mut self, req: &UpdateReq) {
|
||||
self.meta.chunk_ver = req.out_commit_ver;
|
||||
self.meta.chain_ver = req.chain_ver;
|
||||
self.meta.last_request_id = req.last_request_id;
|
||||
self.meta.last_client_low = req.last_client_low;
|
||||
self.meta.last_client_high = req.last_client_high;
|
||||
if req.desired_tag.is_empty() {
|
||||
let r: u64 = rand::thread_rng().gen();
|
||||
self.meta.etag = ETag::from(format!("{:X}", r).as_bytes());
|
||||
} else {
|
||||
self.meta.etag = req.desired_tag.into();
|
||||
}
|
||||
self.meta.uncommitted = true;
|
||||
self.meta.timestamp = ChunkMeta::now();
|
||||
}
|
||||
|
||||
pub fn set_chain_ver(&mut self, chain_ver: u32) {
|
||||
self.meta.chain_ver = chain_ver;
|
||||
}
|
||||
|
||||
pub fn set_committed(&mut self) {
|
||||
self.meta.uncommitted = false;
|
||||
}
|
||||
|
||||
pub fn copy_chunk(&self) -> Result<Chunk> {
|
||||
// 1. allocate new chunk.
|
||||
let mut new_chunk = self.allocator.allocate(true)?;
|
||||
|
||||
// 2. copy meta.
|
||||
new_chunk.meta = ChunkMeta {
|
||||
pos: new_chunk.meta.pos,
|
||||
etag: Default::default(),
|
||||
..self.meta
|
||||
};
|
||||
|
||||
// 3. copy data.
|
||||
Self::BUFFER.with(|v| {
|
||||
let mut vec = v.borrow_mut();
|
||||
let len = self.meta.len.next_multiple_of(ALIGN_SIZE.into());
|
||||
let buf = &mut vec[..len as usize]; // aligned.
|
||||
self.pread(buf, 0)?;
|
||||
new_chunk.pwrite(buf, 0)?;
|
||||
Result::Ok(())
|
||||
})?;
|
||||
|
||||
Ok(new_chunk)
|
||||
}
|
||||
|
||||
pub fn copy_on_write(
|
||||
&self,
|
||||
data: &[u8],
|
||||
offset: u32,
|
||||
checksum: u32,
|
||||
is_syncing: bool,
|
||||
allow_to_allocate: bool,
|
||||
allocators: &Allocators,
|
||||
metrics: &Metrics,
|
||||
) -> Result<Chunk> {
|
||||
// 1. allocate new chunk.
|
||||
let new_len = std::cmp::max(self.meta.len, offset + data.len() as u32);
|
||||
let begin = std::time::Instant::now();
|
||||
let mut new_chunk = allocators.allocate(Size::from(new_len), allow_to_allocate)?;
|
||||
let begin2 = std::time::Instant::now();
|
||||
let latency = begin2.duration_since(begin).as_micros() as _;
|
||||
metrics.allocate_times.fetch_add(1, Ordering::AcqRel);
|
||||
metrics
|
||||
.allocate_latency
|
||||
.fetch_add(latency, Ordering::AcqRel);
|
||||
metrics.copy_on_write_times.fetch_add(1, Ordering::AcqRel);
|
||||
|
||||
// 2. write data.
|
||||
let skip_read = is_syncing || (offset == 0 && data.len() >= self.meta.len as usize);
|
||||
let checksum = Self::BUFFER.with(|v| {
|
||||
let mut vec = v.borrow_mut();
|
||||
if !skip_read {
|
||||
// aligned read.
|
||||
let len = self.meta.len.next_multiple_of(ALIGN_SIZE.into());
|
||||
let begin = std::time::Instant::now();
|
||||
self.pread(&mut vec[..len as usize], 0)?;
|
||||
let latency = std::time::Instant::now().duration_since(begin).as_micros() as _;
|
||||
metrics
|
||||
.copy_on_write_read_times
|
||||
.fetch_add(1, Ordering::AcqRel);
|
||||
metrics
|
||||
.copy_on_write_read_bytes
|
||||
.fetch_add(len as _, Ordering::AcqRel);
|
||||
metrics
|
||||
.copy_on_write_read_latency
|
||||
.fetch_add(latency, Ordering::AcqRel);
|
||||
}
|
||||
|
||||
// aligned write.
|
||||
if skip_read && is_aligned_io(data, offset) {
|
||||
let begin = std::time::Instant::now();
|
||||
new_chunk.pwrite(data, offset)?;
|
||||
let latency = std::time::Instant::now().duration_since(begin).as_micros() as _;
|
||||
metrics.pwrite_times.fetch_add(1, Ordering::AcqRel);
|
||||
metrics.pwrite_latency.fetch_add(latency, Ordering::AcqRel);
|
||||
} else {
|
||||
if self.meta.len < offset {
|
||||
vec[self.meta.len as usize..offset as usize].fill(0);
|
||||
}
|
||||
vec[offset as usize..][..data.len()].copy_from_slice(data);
|
||||
let len = new_len.next_multiple_of(ALIGN_SIZE.into());
|
||||
let begin = std::time::Instant::now();
|
||||
new_chunk.pwrite(&vec[..len as usize], 0)?;
|
||||
let latency = std::time::Instant::now().duration_since(begin).as_micros() as _;
|
||||
metrics.pwrite_times.fetch_add(1, Ordering::AcqRel);
|
||||
metrics.pwrite_latency.fetch_add(latency, Ordering::AcqRel);
|
||||
};
|
||||
|
||||
Result::Ok(if skip_read {
|
||||
metrics.checksum_reuse.fetch_add(1, Ordering::AcqRel);
|
||||
checksum
|
||||
} else {
|
||||
metrics.checksum_recalculate.fetch_add(1, Ordering::AcqRel);
|
||||
crc32c::crc32c(&vec[..new_len as usize])
|
||||
})
|
||||
})?;
|
||||
let latency = std::time::Instant::now().duration_since(begin2).as_micros() as _;
|
||||
metrics
|
||||
.copy_on_write_latency
|
||||
.fetch_add(latency, Ordering::AcqRel);
|
||||
|
||||
// 3. copy meta.
|
||||
new_chunk.meta.len = if is_syncing {
|
||||
offset + data.len() as u32
|
||||
} else {
|
||||
new_len
|
||||
};
|
||||
new_chunk.meta.checksum = checksum;
|
||||
|
||||
Ok(new_chunk)
|
||||
}
|
||||
|
||||
pub fn safe_write(
|
||||
&mut self,
|
||||
data: &[u8],
|
||||
offset: u32,
|
||||
checksum: u32,
|
||||
truncate: bool,
|
||||
metrics: &Metrics,
|
||||
) -> Result<()> {
|
||||
if truncate && offset < self.meta.len {
|
||||
metrics
|
||||
.safe_write_truncate_shorten
|
||||
.fetch_add(1, Ordering::AcqRel);
|
||||
metrics.checksum_recalculate.fetch_add(1, Ordering::AcqRel);
|
||||
return Self::BUFFER.with(|v| {
|
||||
// aligned read.
|
||||
let mut vec = v.borrow_mut();
|
||||
let len = offset.next_multiple_of(ALIGN_SIZE.into());
|
||||
self.pread(&mut vec[..len as usize], 0)?;
|
||||
self.meta.len = offset;
|
||||
self.meta.checksum = crc32c::crc32c(&vec[..offset as usize]);
|
||||
Result::Ok(())
|
||||
});
|
||||
}
|
||||
|
||||
if is_aligned_len(self.meta.len)
|
||||
&& is_aligned_len(offset)
|
||||
&& (data.is_empty() || is_aligned_buf(data))
|
||||
{
|
||||
// already aligned.
|
||||
if offset > self.meta.len {
|
||||
let padding = (offset - self.meta.len) as usize;
|
||||
let begin = std::time::Instant::now();
|
||||
self.pwrite(&ZERO[..padding], self.meta.len)?;
|
||||
let latency = std::time::Instant::now().duration_since(begin).as_micros() as _;
|
||||
metrics.pwrite_times.fetch_add(1, Ordering::AcqRel);
|
||||
metrics.pwrite_latency.fetch_add(latency, Ordering::AcqRel);
|
||||
self.meta.len = offset;
|
||||
self.meta.checksum = crc32c::crc32c_append(self.meta.checksum, &ZERO[..padding]);
|
||||
metrics
|
||||
.safe_write_truncate_extend
|
||||
.fetch_add(1, Ordering::AcqRel);
|
||||
metrics.checksum_combine.fetch_add(1, Ordering::AcqRel);
|
||||
}
|
||||
|
||||
if !data.is_empty() {
|
||||
assert!(offset == self.meta.len);
|
||||
let begin = std::time::Instant::now();
|
||||
self.pwrite(data, offset)?;
|
||||
let latency = std::time::Instant::now().duration_since(begin).as_micros() as u64;
|
||||
metrics.pwrite_times.fetch_add(1, Ordering::AcqRel);
|
||||
metrics.pwrite_latency.fetch_add(latency, Ordering::AcqRel);
|
||||
self.meta.len = offset + data.len() as u32;
|
||||
self.meta.checksum =
|
||||
crc32c::crc32c_combine(self.meta.checksum, checksum, data.len());
|
||||
metrics
|
||||
.safe_write_direct_append
|
||||
.fetch_add(1, Ordering::AcqRel);
|
||||
metrics.checksum_combine.fetch_add(1, Ordering::AcqRel);
|
||||
}
|
||||
} else if self.meta.len < offset + data.len() as u32 {
|
||||
// copy to buffer and write.
|
||||
assert!(self.meta.len <= offset);
|
||||
Self::BUFFER.with(|v| {
|
||||
let mut vec = v.borrow_mut();
|
||||
let start = self.meta.len & !(ALIGN_SIZE.0 as u32 - 1);
|
||||
if start != self.meta.len {
|
||||
metrics
|
||||
.safe_write_read_tail_times
|
||||
.fetch_add(1, Ordering::AcqRel);
|
||||
metrics
|
||||
.safe_write_read_tail_bytes
|
||||
.fetch_add(ALIGN_SIZE.0, Ordering::AcqRel);
|
||||
self.pread(&mut vec[start as usize..][..ALIGN_SIZE.into()], start)?;
|
||||
}
|
||||
if self.meta.len < offset {
|
||||
metrics
|
||||
.safe_write_truncate_extend
|
||||
.fetch_add(1, Ordering::AcqRel);
|
||||
vec[self.meta.len as usize..offset as usize].fill(0);
|
||||
}
|
||||
vec[offset as usize..][..data.len()].copy_from_slice(data);
|
||||
let new_len = offset as usize + data.len();
|
||||
let begin = std::time::Instant::now();
|
||||
self.pwrite(
|
||||
&vec[start as usize..new_len.next_multiple_of(ALIGN_SIZE.into())],
|
||||
start,
|
||||
)?;
|
||||
let latency = std::time::Instant::now().duration_since(begin).as_micros() as _;
|
||||
metrics.pwrite_times.fetch_add(1, Ordering::AcqRel);
|
||||
metrics.pwrite_latency.fetch_add(latency, Ordering::AcqRel);
|
||||
self.meta.checksum = crc32c::crc32c_append(
|
||||
self.meta.checksum,
|
||||
&vec[self.meta.len as usize..new_len],
|
||||
);
|
||||
metrics
|
||||
.safe_write_indirect_append
|
||||
.fetch_add(1, Ordering::AcqRel);
|
||||
metrics.checksum_combine.fetch_add(1, Ordering::AcqRel);
|
||||
self.meta.len = new_len as u32;
|
||||
Result::Ok(())
|
||||
})?;
|
||||
} else {
|
||||
assert!(data.is_empty());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Fills `buf` from this chunk's position, starting at `offset`, by
/// delegating to the allocator's clusters.
pub fn pread(&self, buf: &mut [u8], offset: u32) -> Result<()> {
    let pos = self.meta.pos;
    self.allocator.clusters.pread(pos, buf, offset)
}
|
||||
|
||||
/// Writes `buf` at `offset` within this chunk's position by delegating to
/// the allocator's clusters.
pub(super) fn pwrite(&self, buf: &[u8], offset: u32) -> Result<()> {
    let pos = self.meta.pos;
    self.allocator.clusters.pwrite(pos, buf, offset)
}
|
||||
|
||||
/// Returns what the allocator's clusters report as `FdAndOffset` for this
/// chunk's position.
pub fn fd_and_offset(&self) -> FdAndOffset {
    let pos = self.meta.pos;
    self.allocator.clusters.fd_and_offset(pos)
}
|
||||
}
|
||||
|
||||
impl Clone for Chunk {
|
||||
fn clone(&self) -> Self {
|
||||
self.allocator.reference(self.meta.clone(), false)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for Chunk {
|
||||
fn drop(&mut self) {
|
||||
self.allocator.dereference(self.meta.pos);
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Chunk {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
std::fmt::Debug::fmt(&self.meta, f)
|
||||
}
|
||||
}
|
||||
304
src/storage/chunk_engine/src/alloc/chunk_allocator.rs
Normal file
304
src/storage/chunk_engine/src/alloc/chunk_allocator.rs
Normal file
@@ -0,0 +1,304 @@
|
||||
use super::super::*;
|
||||
use std::collections::hash_map::Entry;
|
||||
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub struct ChunkAllocator {
|
||||
pub full_groups: ShardsSet<GroupId>,
|
||||
pub active_groups: ShardsMap<GroupId, GroupState>,
|
||||
pub(super) active_levels: [ShardsSet<GroupId>; GroupState::LEVELS],
|
||||
pub(super) frozen_groups: ShardsMap<GroupId, GroupState>,
|
||||
pub(super) group_allocator: GroupAllocator,
|
||||
pub(super) position_rc: ShardsMap<Position, u32>,
|
||||
pub(super) counter: Arc<AllocatorCounter>,
|
||||
}
|
||||
|
||||
impl ChunkAllocator {
|
||||
pub fn with_chunk_size(chunk_size: Size) -> Self {
|
||||
let counter = Arc::new(AllocatorCounter::new(chunk_size));
|
||||
Self {
|
||||
full_groups: Default::default(),
|
||||
active_groups: Default::default(),
|
||||
active_levels: Default::default(),
|
||||
frozen_groups: Default::default(),
|
||||
group_allocator: GroupAllocator::init(counter.clone()),
|
||||
position_rc: Default::default(),
|
||||
counter,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn load(
|
||||
mut it: RocksDBIterator,
|
||||
counter: Arc<AllocatorCounter>,
|
||||
chunk_size: Size,
|
||||
) -> Result<Self> {
|
||||
let mut full_groups = ShardsSet::with_capacity(4096);
|
||||
let mut active_groups = ShardsMap::with_capacity(4096);
|
||||
let frozen_groups = ShardsMap::with_capacity(4096);
|
||||
let mut active_levels = std::array::from_fn(|_| ShardsSet::with_capacity(4096));
|
||||
|
||||
let mut allocated_groups = ShardsSet::with_capacity(4096);
|
||||
let mut unallocated_groups = ShardsSet::with_capacity(4096);
|
||||
let mut current = GroupId::new(chunk_size, 0, 0);
|
||||
|
||||
let mut allocated_count: u64 = 0;
|
||||
let mut reserved_count: u64 = 0;
|
||||
|
||||
let prefix = MetaKey::group_bits_chunk_size_prefix(current);
|
||||
it.iterate(prefix, |key, value| {
|
||||
let group_id = MetaKey::parse_group_bits_key(key)?;
|
||||
let group_state = GroupState::from(value)?;
|
||||
|
||||
assert!(
|
||||
current <= group_id,
|
||||
"current {current:?} > next {group_id:?}"
|
||||
);
|
||||
while current < group_id {
|
||||
unallocated_groups.insert(current);
|
||||
current.next();
|
||||
}
|
||||
current.next();
|
||||
|
||||
allocated_count += GroupState::TOTAL_BITS as u64;
|
||||
if group_state.is_empty() {
|
||||
allocated_groups.insert(group_id);
|
||||
reserved_count += GroupState::TOTAL_BITS as u64;
|
||||
} else if group_state.is_full() {
|
||||
full_groups.insert(group_id);
|
||||
} else {
|
||||
reserved_count += GroupState::TOTAL_BITS as u64 - group_state.count() as u64;
|
||||
active_levels[group_state.level() as usize].insert(group_id);
|
||||
active_groups.insert(group_id, group_state);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})?;
|
||||
|
||||
counter.init(allocated_count, reserved_count);
|
||||
let chunk_allocator = ChunkAllocator {
|
||||
full_groups,
|
||||
active_groups,
|
||||
active_levels,
|
||||
frozen_groups,
|
||||
counter: counter.clone(),
|
||||
group_allocator: GroupAllocator {
|
||||
allocated_groups,
|
||||
unallocated_groups,
|
||||
next_group_id: current,
|
||||
counter,
|
||||
},
|
||||
position_rc: ShardsMap::with_capacity(1 << 20),
|
||||
};
|
||||
|
||||
Ok(chunk_allocator)
|
||||
}
|
||||
|
||||
pub fn allocate(&mut self, clusters: &Clusters, allow_to_allocate: bool) -> Result<Position> {
|
||||
if !self.active_groups.is_empty() {
|
||||
for level in (0..GroupState::LEVELS).rev() {
|
||||
let set = &mut self.active_levels[level];
|
||||
if let Some(&group_id) = set.iter().next() {
|
||||
let state = self.active_groups.get_mut(&group_id).unwrap();
|
||||
let index = state.allocate().unwrap();
|
||||
if state.is_full() {
|
||||
self.full_groups.insert(group_id);
|
||||
self.active_groups.remove(&group_id);
|
||||
set.remove(&group_id);
|
||||
} else if state.level() != level as u32 {
|
||||
set.remove(&group_id);
|
||||
self.active_levels[level + 1].insert(group_id);
|
||||
}
|
||||
let pos = Position::new(group_id, index);
|
||||
self.reference(pos, true);
|
||||
self.counter.allocate_chunk();
|
||||
return Ok(pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let group_id = self.group_allocator.allocate(clusters, allow_to_allocate)?;
|
||||
self.counter.allocate_chunk();
|
||||
let state = match self.active_groups.entry(group_id) {
|
||||
Entry::Occupied(_) => panic!("should not be active groups: {:?}", group_id),
|
||||
Entry::Vacant(entry) => entry.insert(GroupState::empty()),
|
||||
};
|
||||
let index = state.allocate().unwrap();
|
||||
self.active_levels[state.level() as usize].insert(group_id);
|
||||
let pos = Position::new(group_id, index);
|
||||
self.reference(pos, true);
|
||||
Ok(pos)
|
||||
}
|
||||
|
||||
pub fn reference(&mut self, pos: Position, first_ref: bool) {
|
||||
let group_id = pos.group_id();
|
||||
if let Some(state) = self.active_groups.get_mut(&group_id) {
|
||||
assert!(state.check(pos.index()), "ref pos failed: {:?}", pos);
|
||||
} else if let Some(state) = self.frozen_groups.get_mut(&group_id) {
|
||||
assert!(state.check(pos.index()), "ref pos failed: {:?}", pos);
|
||||
} else {
|
||||
assert!(self.full_groups.contains(&group_id));
|
||||
}
|
||||
|
||||
let rc = match self.position_rc.entry(pos) {
|
||||
Entry::Occupied(mut occupied_entry) => {
|
||||
let rc = occupied_entry.get_mut();
|
||||
*rc += 1;
|
||||
*rc
|
||||
}
|
||||
Entry::Vacant(vacant_entry) => {
|
||||
self.counter.position_count.fetch_add(1, Ordering::AcqRel);
|
||||
vacant_entry.insert(1);
|
||||
1
|
||||
}
|
||||
};
|
||||
self.counter.position_rc.fetch_add(1, Ordering::AcqRel);
|
||||
|
||||
if first_ref {
|
||||
assert!(rc == 1, "should be first ref to pos {:?}, rc {}", pos, rc);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn dereference(&mut self, pos: Position) {
|
||||
self.counter.position_rc.fetch_sub(1, Ordering::AcqRel);
|
||||
let count = self.position_rc.get_mut(&pos).unwrap();
|
||||
*count -= 1;
|
||||
if *count == 0 {
|
||||
self.counter.position_count.fetch_sub(1, Ordering::AcqRel);
|
||||
self.position_rc.remove(&pos);
|
||||
self.deallocate(pos);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn deallocate(&mut self, pos: Position) {
|
||||
let group_id = pos.group_id();
|
||||
if let Some(state) = self.active_groups.get_mut(&group_id) {
|
||||
let level = state.level();
|
||||
state.deallocate(pos.index()).unwrap();
|
||||
if state.is_empty() {
|
||||
self.active_groups.remove(&group_id);
|
||||
self.active_levels[level as usize].remove(&group_id);
|
||||
self.group_allocator.deallocate(group_id);
|
||||
} else if state.level() != level {
|
||||
self.active_levels[level as usize].remove(&group_id);
|
||||
self.active_levels[level as usize - 1].insert(group_id);
|
||||
}
|
||||
} else if let Some(state) = self.frozen_groups.get_mut(&group_id) {
|
||||
state.deallocate(pos.index()).unwrap();
|
||||
if state.is_empty() {
|
||||
self.frozen_groups.remove(&group_id);
|
||||
self.group_allocator.deallocate(group_id);
|
||||
}
|
||||
} else if self.full_groups.contains(&group_id) {
|
||||
let mut state = GroupState::full();
|
||||
state.deallocate(pos.index()).unwrap();
|
||||
self.active_levels[state.level() as usize].insert(group_id);
|
||||
self.active_groups.insert(group_id, state);
|
||||
self.full_groups.remove(&group_id);
|
||||
} else {
|
||||
unreachable!(
|
||||
"deallocate position failed! not found this position: {:?}",
|
||||
pos
|
||||
);
|
||||
}
|
||||
self.counter.deallocate_chunk();
|
||||
}
|
||||
|
||||
/// Picks a group to compact when the counter reports more than
/// `max_reserved` reserved chunks.
///
/// Levels are scanned from index 0 upward; the first group found is moved
/// from the active sets into `frozen_groups` and returned. Returns `None`
/// when at or below the threshold, or when there is no active group.
pub fn get_compact_task(&mut self, max_reserved: u64) -> Option<GroupId> {
    if self.counter.reserved_chunks() <= max_reserved {
        return None;
    }

    for level_set in self.active_levels.iter_mut() {
        if let Some(&group_id) = level_set.iter().next() {
            level_set.remove(&group_id);
            let state = self.active_groups.remove(&group_id).unwrap();
            self.frozen_groups.insert(group_id, state);
            return Some(group_id);
        }
    }

    None
}
|
||||
|
||||
pub fn finish_compact_task(&mut self, group_id: GroupId) {
|
||||
if let Some(state) = self.frozen_groups.remove(&group_id) {
|
||||
self.active_levels[state.level() as usize].insert(group_id);
|
||||
self.active_groups.insert(group_id, state);
|
||||
tracing::info!("finish compact task and move back {:?}", group_id);
|
||||
} else {
|
||||
tracing::info!("finish compact task successful!");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Fills one whole group position by position and checks how the group
    /// moves between the level sets and finally into `full_groups`.
    #[test]
    fn test_chunk_allocator() {
        let dir = tempfile::tempdir().unwrap();

        let config = ClustersConfig {
            path: dir.path().into(),
            chunk_size: CHUNK_SIZE_NORMAL,
            create: true,
        };
        let clusters = Clusters::open(&config).unwrap();

        let mut allocator = ChunkAllocator::with_chunk_size(CHUNK_SIZE_NORMAL);
        assert!(allocator.active_groups.is_empty());
        assert!(allocator.active_levels.iter().all(|set| set.is_empty()));
        assert!(allocator.full_groups.is_empty());

        // All but the last position of the first level.
        let one_level_count = GroupState::TOTAL_BITS / GroupState::LEVELS;
        for i in 0..(one_level_count - 1) {
            let pos = allocator.allocate(&clusters, true).unwrap();
            assert_eq!(pos, Position::new(GroupId::default(), i as _));
        }
        assert_eq!(allocator.active_groups.len(), 1);
        assert_eq!(allocator.active_levels[0].len(), 1);

        // The next allocation moves the group from level 0 to level 1.
        let pos = allocator.allocate(&clusters, true).unwrap();
        let expected = Position::new(GroupId::default(), one_level_count as u8 - 1);
        assert_eq!(pos, expected);
        assert_eq!(allocator.active_groups.len(), 1);
        assert_eq!(allocator.active_levels[0].len(), 0);
        assert_eq!(allocator.active_levels[1].len(), 1);

        let used_size = allocator.counter.used_size();
        assert_eq!(
            used_size.allocated_size,
            CHUNK_SIZE_NORMAL * GroupState::TOTAL_BITS
        );
        assert_eq!(
            used_size.reserved_size,
            CHUNK_SIZE_NORMAL * (GroupState::TOTAL_BITS - one_level_count)
        );

        // Use up the rest of the group.
        for i in one_level_count..GroupState::TOTAL_BITS {
            let pos = allocator.allocate(&clusters, true).unwrap();
            assert_eq!(pos, Position::new(GroupId::default(), i as _));
        }
        assert!(allocator.active_groups.is_empty());
        assert!(allocator.active_levels.iter().all(|set| set.is_empty()));
        assert_eq!(allocator.full_groups.len(), 1);
    }

    /// Deallocating a position of an untracked group must panic.
    #[test]
    #[should_panic(expected = "not found this position")]
    fn test_chunk_invalid_deallocate() {
        let mut allocator = ChunkAllocator::with_chunk_size(CHUNK_SIZE_NORMAL);
        let unknown = Position::default();
        allocator.deallocate(unknown);
    }
}
|
||||
190
src/storage/chunk_engine/src/alloc/group_allocator.rs
Normal file
190
src/storage/chunk_engine/src/alloc/group_allocator.rs
Normal file
@@ -0,0 +1,190 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::super::*;
|
||||
|
||||
pub struct GroupAllocator {
|
||||
pub(super) allocated_groups: ShardsSet<GroupId>,
|
||||
pub(super) unallocated_groups: ShardsSet<GroupId>,
|
||||
pub(super) next_group_id: GroupId,
|
||||
pub(super) counter: Arc<AllocatorCounter>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum AllocateTask {
|
||||
None,
|
||||
Allocate(GroupId),
|
||||
Deallocate(GroupId),
|
||||
}
|
||||
|
||||
impl GroupAllocator {
|
||||
pub fn init(counter: Arc<AllocatorCounter>) -> Self {
|
||||
Self {
|
||||
allocated_groups: Default::default(),
|
||||
unallocated_groups: Default::default(),
|
||||
next_group_id: Default::default(),
|
||||
counter,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn allocate(&mut self, clusters: &Clusters, allow_to_allocate: bool) -> Result<GroupId> {
|
||||
if let Some(&group_id) = self.allocated_groups.iter().next() {
|
||||
self.allocated_groups.remove(&group_id);
|
||||
Ok(group_id)
|
||||
} else if allow_to_allocate {
|
||||
let group_id = self.get_unallocated_group_id();
|
||||
tracing::info!("allocate group slow path {:?}", group_id);
|
||||
let result = clusters.allocate(group_id);
|
||||
if let Err(err) = result {
|
||||
self.unallocated_groups.insert(group_id);
|
||||
return Err(err);
|
||||
}
|
||||
self.counter.allocate_group();
|
||||
Ok(group_id)
|
||||
} else {
|
||||
Err(Error::NoSpace)
|
||||
}
|
||||
}
|
||||
|
||||
/// Puts `group_id` back into the set of allocated groups that can be handed
/// out again.
pub fn deallocate(&mut self, group_id: GroupId) {
    let groups = &mut self.allocated_groups;
    groups.insert(group_id);
}
|
||||
|
||||
/// Picks an id for a group that is not allocated yet: a previously released
/// id if one exists, otherwise `next_group_id`, which is then advanced.
fn get_unallocated_group_id(&mut self) -> GroupId {
    match self.unallocated_groups.iter().next().copied() {
        Some(recycled) => {
            self.unallocated_groups.remove(&recycled);
            recycled
        }
        None => {
            let fresh = self.next_group_id;
            self.next_group_id = self.next_group_id.plus_one();
            fresh
        }
    }
}
|
||||
|
||||
pub fn get_allocate_task(&mut self, min_remain: usize, max_remain: usize) -> AllocateTask {
|
||||
if self.allocated_groups.len() < min_remain {
|
||||
AllocateTask::Allocate(self.get_unallocated_group_id())
|
||||
} else if self.allocated_groups.len() > max_remain {
|
||||
let group_id = *self.allocated_groups.iter().next().unwrap();
|
||||
self.allocated_groups.remove(&group_id);
|
||||
AllocateTask::Deallocate(group_id)
|
||||
} else {
|
||||
AllocateTask::None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn finish_allocate_task(&mut self, task: AllocateTask, succ: bool) {
|
||||
match (task, succ) {
|
||||
(AllocateTask::Allocate(group_id), true) => {
|
||||
self.counter.allocate_group();
|
||||
self.allocated_groups.insert(group_id)
|
||||
}
|
||||
(AllocateTask::Deallocate(group_id), true) => {
|
||||
self.counter.deallocate_group();
|
||||
self.unallocated_groups.insert(group_id)
|
||||
}
|
||||
(AllocateTask::Allocate(group_id), false) => self.unallocated_groups.insert(group_id),
|
||||
(AllocateTask::Deallocate(group_id), false) => self.allocated_groups.insert(group_id),
|
||||
_ => false,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Allocating, releasing and re-allocating groups against real clusters.
    #[test]
    fn test_group_allocator() {
        let dir = tempfile::tempdir().unwrap();

        let config = ClustersConfig {
            path: dir.path().into(),
            chunk_size: CHUNK_SIZE_NORMAL,
            create: true,
        };
        let clusters = Clusters::open(&config).unwrap();
        let counter = Arc::new(AllocatorCounter::new(CHUNK_SIZE_NORMAL));
        let mut allocator = GroupAllocator::init(counter);

        // First allocation takes the default id.
        let first = allocator.allocate(&clusters, true).unwrap();
        assert_eq!(first, GroupId::default());
        assert_eq!(allocator.next_group_id, first.plus_one());
        assert!(allocator.allocated_groups.is_empty());
        assert!(allocator.unallocated_groups.is_empty());

        // Second allocation takes the following id.
        let second = allocator.allocate(&clusters, true).unwrap();
        assert_eq!(first.plus_one(), second);
        assert_eq!(allocator.next_group_id, second.plus_one());
        assert!(allocator.allocated_groups.is_empty());
        assert!(allocator.unallocated_groups.is_empty());

        // A released group is kept as allocated ...
        allocator.deallocate(first);
        assert_eq!(allocator.next_group_id, second.plus_one());
        assert_eq!(allocator.allocated_groups.len(), 1);
        assert!(allocator.unallocated_groups.is_empty());

        // ... and is handed out again before any new id is used.
        let third = allocator.allocate(&clusters, true).unwrap();
        assert_eq!(first, third);
        assert_eq!(allocator.next_group_id, second.plus_one());
        assert!(allocator.allocated_groups.is_empty());
        assert!(allocator.unallocated_groups.is_empty());

        // Without permission to allocate there is nothing to return.
        allocator.allocate(&clusters, false).unwrap_err();
    }

    /// Task generation and completion bookkeeping, without touching clusters.
    #[test]
    fn test_group_allocator_task() {
        let counter = Arc::new(AllocatorCounter::new(CHUNK_SIZE_NORMAL));
        let mut allocator = GroupAllocator::init(counter);
        assert!(allocator.allocated_groups.is_empty());
        assert!(allocator.unallocated_groups.is_empty());
        assert_eq!(allocator.next_group_id.cluster(), 0);

        // Below the minimum: an allocate task consumes a fresh id.
        let task = allocator.get_allocate_task(2, 4);
        assert!(matches!(task, AllocateTask::Allocate(_)));
        assert!(allocator.allocated_groups.is_empty());
        assert!(allocator.unallocated_groups.is_empty());
        assert_eq!(allocator.next_group_id.cluster(), 1);

        // A failed allocate returns the id, and the next task reuses it.
        allocator.finish_allocate_task(task, false);
        let task = allocator.get_allocate_task(2, 4);
        assert!(matches!(task, AllocateTask::Allocate(_)));
        assert!(allocator.allocated_groups.is_empty());
        assert!(allocator.unallocated_groups.is_empty());
        assert_eq!(allocator.next_group_id.cluster(), 1);

        allocator.finish_allocate_task(task, true);
        assert_eq!(allocator.allocated_groups.len(), 1);
        assert_eq!(allocator.unallocated_groups.len(), 0);
        assert_eq!(allocator.next_group_id.cluster(), 1);

        let task = allocator.get_allocate_task(2, 4);
        assert!(matches!(task, AllocateTask::Allocate(_)));
        allocator.finish_allocate_task(task, true);
        assert_eq!(allocator.allocated_groups.len(), 2);
        assert_eq!(allocator.unallocated_groups.len(), 0);
        assert_eq!(allocator.next_group_id.cluster(), 2);

        // Within the range: nothing to do.
        let task = allocator.get_allocate_task(2, 4);
        assert!(matches!(task, AllocateTask::None));
        allocator.finish_allocate_task(task, true);
        assert_eq!(allocator.allocated_groups.len(), 2);
        assert_eq!(allocator.unallocated_groups.len(), 0);
        assert_eq!(allocator.next_group_id.cluster(), 2);

        // A failed allocate leaves its id in the unallocated set.
        let task = allocator.get_allocate_task(3, 4);
        assert!(matches!(task, AllocateTask::Allocate(_)));
        allocator.finish_allocate_task(task, false);
        assert_eq!(allocator.allocated_groups.len(), 2);
        assert_eq!(allocator.unallocated_groups.len(), 1);
        assert_eq!(allocator.next_group_id.cluster(), 3);

        // A failed deallocate puts the group back.
        let task = allocator.get_allocate_task(1, 1);
        assert!(matches!(task, AllocateTask::Deallocate(_)));
        allocator.finish_allocate_task(task, false);
        assert_eq!(allocator.allocated_groups.len(), 2);
        assert_eq!(allocator.unallocated_groups.len(), 1);
        assert_eq!(allocator.next_group_id.cluster(), 3);
    }
}
|
||||
27
src/storage/chunk_engine/src/alloc/metrics.rs
Normal file
27
src/storage/chunk_engine/src/alloc/metrics.rs
Normal file
@@ -0,0 +1,27 @@
|
||||
use std::sync::atomic::AtomicU64;
|
||||
|
||||
/// Monotonic counters updated by the chunk write paths.
///
/// The layout is `repr(C)`, so the field order below is part of the type's
/// contract and must not be rearranged. Latency fields accumulate
/// microseconds.
#[derive(Debug, Default)]
#[repr(C)]
pub struct Metrics {
    // Copy-on-write path.
    pub copy_on_write_times: AtomicU64,
    /// Total time spent in the copy-on-write step.
    pub copy_on_write_latency: AtomicU64,
    /// Bytes read from the old chunk during copy-on-write.
    pub copy_on_write_read_bytes: AtomicU64,
    /// Number of reads of the old chunk during copy-on-write.
    pub copy_on_write_read_times: AtomicU64,
    /// Total time of those reads.
    pub copy_on_write_read_latency: AtomicU64,

    // How the chunk checksum was obtained.
    /// The caller-supplied checksum was used as is.
    pub checksum_reuse: AtomicU64,
    /// The checksum was extended from the previous one.
    pub checksum_combine: AtomicU64,
    /// The checksum was recomputed over the chunk content.
    pub checksum_recalculate: AtomicU64,

    // In-place (`safe_write`) path.
    /// Aligned data written directly from the caller's buffer.
    pub safe_write_direct_append: AtomicU64,
    /// Data staged in an intermediate buffer before being written.
    pub safe_write_indirect_append: AtomicU64,
    /// Truncations that shortened a chunk.
    pub safe_write_truncate_shorten: AtomicU64,
    /// Writes that zero-filled a gap past the previous end of the chunk.
    pub safe_write_truncate_extend: AtomicU64,
    /// Number of reads of a partially filled tail block.
    pub safe_write_read_tail_times: AtomicU64,
    /// Bytes read by those tail reads.
    pub safe_write_read_tail_bytes: AtomicU64,

    // Allocation and raw writes.
    pub allocate_times: AtomicU64,
    pub allocate_latency: AtomicU64,
    /// Number of `pwrite` calls.
    pub pwrite_times: AtomicU64,
    /// Total time spent in `pwrite`.
    pub pwrite_latency: AtomicU64,
}
|
||||
17
src/storage/chunk_engine/src/alloc/mod.rs
Normal file
17
src/storage/chunk_engine/src/alloc/mod.rs
Normal file
@@ -0,0 +1,17 @@
|
||||
mod allocator;
|
||||
mod allocator_counter;
|
||||
mod allocators;
|
||||
mod chunk;
|
||||
mod chunk_allocator;
|
||||
mod group_allocator;
|
||||
mod metrics;
|
||||
mod writing_chunk;
|
||||
|
||||
pub use allocator::*;
|
||||
pub use allocator_counter::*;
|
||||
pub use allocators::*;
|
||||
pub use chunk::*;
|
||||
pub use chunk_allocator::*;
|
||||
pub use group_allocator::*;
|
||||
pub use metrics::*;
|
||||
pub use writing_chunk::*;
|
||||
124
src/storage/chunk_engine/src/alloc/writing_chunk.rs
Normal file
124
src/storage/chunk_engine/src/alloc/writing_chunk.rs
Normal file
@@ -0,0 +1,124 @@
|
||||
use crate::{Bytes, Chunk, ChunkArc, ChunkMeta};
|
||||
use dashmap::DashMap;
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
pub struct WritingHolder {
|
||||
pub chunk: Chunk,
|
||||
pub abort: bool,
|
||||
}
|
||||
|
||||
/// Writing list: outer key is a chunk-id prefix, inner key is the full chunk id.
pub type WritingList = DashMap<Bytes, HashMap<Bytes, WritingHolder>>;
|
||||
|
||||
pub struct WritingChunk {
|
||||
pub chunk_id: Bytes,
|
||||
pub chunk: Chunk,
|
||||
pub list: Arc<WritingList>,
|
||||
pub prefix_len: u32,
|
||||
pub is_remove: bool,
|
||||
pub commit_succ: bool,
|
||||
}
|
||||
|
||||
impl WritingChunk {
|
||||
pub fn meta(&self) -> &ChunkMeta {
|
||||
self.chunk.meta()
|
||||
}
|
||||
|
||||
pub fn set_committed(&mut self) {
|
||||
self.chunk.set_committed();
|
||||
}
|
||||
|
||||
pub fn commit_succ(&mut self) {
|
||||
self.commit_succ = true;
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WritingChunk {
|
||||
fn drop(&mut self) {
|
||||
let prefix = &self.chunk_id[..self.prefix_len as usize];
|
||||
if let Some(mut map) = self.list.get_mut(prefix) {
|
||||
if self.commit_succ {
|
||||
if map.remove(&self.chunk_id).is_some() {
|
||||
return;
|
||||
}
|
||||
} else if let Some(holder) = map.get_mut(&self.chunk_id) {
|
||||
holder.abort = true;
|
||||
return;
|
||||
}
|
||||
}
|
||||
panic!("chunk id {:?} is not in the writing list!", self.chunk_id);
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&WritingChunk> for ChunkArc {
|
||||
fn from(chunk: &WritingChunk) -> Self {
|
||||
Arc::new(chunk.chunk.clone())
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for WritingChunk {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("WritingChunk")
|
||||
.field("chunk_id", &self.chunk_id)
|
||||
.field("chunk", &self.chunk)
|
||||
.field("is_remove", &self.is_remove)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use crate::*;
    use std::sync::Arc;

    /// Builds a `WritingChunk` for id `b"test"` whose entry is missing from
    /// the writing list and lets it drop at the end of the function.
    ///
    /// `has_list` controls whether the prefix bucket `b"te"` exists (always
    /// without the chunk entry); `commit_succ` selects the drop branch.
    fn test_writing_chunk_not_in_list(has_list: bool, commit_succ: bool) {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path();

        let rocksdb = RocksDBConfig {
            path: path.join("meta"),
            create: true,
            ..Default::default()
        };
        let meta_config = MetaStoreConfig {
            rocksdb,
            ..Default::default()
        };

        let meta_store = Arc::new(MetaStore::open(&meta_config).unwrap());
        let allocators = Allocators::new(path, true, meta_store).unwrap();
        let chunk = allocators.allocate(CHUNK_SIZE_NORMAL, true).unwrap();

        let writing_list: Arc<WritingList> = Default::default();
        if has_list {
            let prefix = Bytes::from(b"te".as_slice());
            writing_list.entry(prefix).or_default();
        }

        let writing_chunk = WritingChunk {
            chunk_id: b"test".as_ref().into(),
            chunk,
            list: writing_list.clone(),
            prefix_len: 2,
            is_remove: false,
            commit_succ,
        };
        println!("{:#?}", writing_chunk);
    }

    /// No prefix bucket at all, not committed.
    #[test]
    #[should_panic(expected = "chunk id [116, 101, 115, 116] is not in the writing list!")]
    fn test_writing_chunk_not_in_list_1() {
        test_writing_chunk_not_in_list(false, false);
    }

    /// Prefix bucket exists but holds no entry, not committed.
    #[test]
    #[should_panic(expected = "chunk id [116, 101, 115, 116] is not in the writing list!")]
    fn test_writing_chunk_not_in_list_2() {
        test_writing_chunk_not_in_list(true, false);
    }

    /// Prefix bucket exists but holds no entry, committed.
    #[test]
    #[should_panic(expected = "chunk id [116, 101, 115, 116] is not in the writing list!")]
    fn test_writing_chunk_not_in_list_3() {
        test_writing_chunk_not_in_list(true, true);
    }
}
|
||||
89
src/storage/chunk_engine/src/bin/bench.rs
Normal file
89
src/storage/chunk_engine/src/bin/bench.rs
Normal file
@@ -0,0 +1,89 @@
|
||||
use std::sync::{
|
||||
atomic::{AtomicUsize, Ordering},
|
||||
Arc,
|
||||
};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chunk_engine::*;
|
||||
use serde::Deserialize;
|
||||
|
||||
#[derive(Debug, Default, Deserialize)]
|
||||
struct Config {
|
||||
engine: EngineConfig,
|
||||
threads: usize,
|
||||
count: usize,
|
||||
level: String,
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let mut iter = std::env::args();
|
||||
iter.next();
|
||||
let config_path = iter
|
||||
.next()
|
||||
.ok_or(anyhow::anyhow!("get config path failed"))?;
|
||||
|
||||
let content = std::fs::read_to_string(&config_path)
|
||||
.with_context(|| format!("failed to open config file {:?}", config_path))?;
|
||||
|
||||
let config: Config = toml::from_str(&content)
|
||||
.with_context(|| format!("failed to parse config file {:?}", config_path))?;
|
||||
|
||||
let level = match config.level.as_str() {
|
||||
"info" => tracing::Level::INFO,
|
||||
"debug" => tracing::Level::DEBUG,
|
||||
_ => tracing::Level::WARN,
|
||||
};
|
||||
tracing_subscriber::fmt().with_max_level(level).init();
|
||||
tracing::info!("config content: {:#?}", config);
|
||||
|
||||
let engine = chunk_engine::Engine::open(&config.engine).unwrap();
|
||||
engine.start_allocate_workers(2);
|
||||
std::thread::sleep(std::time::Duration::from_millis(100));
|
||||
let bytes = Arc::new(AtomicUsize::default());
|
||||
let running = Arc::new(AtomicUsize::default());
|
||||
|
||||
let threads = (0..config.threads)
|
||||
.map(|i| {
|
||||
let engine = engine.clone();
|
||||
let bytes = bytes.clone();
|
||||
let running = running.clone();
|
||||
|
||||
let mut vec = create_aligned_vec(CHUNK_SIZE_NORMAL);
|
||||
vec.fill(i as u8);
|
||||
let checksum = crc32c::crc32c(&vec);
|
||||
running.fetch_add(1, Ordering::SeqCst);
|
||||
|
||||
Ok(std::thread::spawn(move || {
|
||||
let mut chunk_id: usize = i << 32;
|
||||
for _ in 0..config.count {
|
||||
engine
|
||||
.write(&chunk_id.to_be_bytes(), &vec, 0, checksum)
|
||||
.unwrap();
|
||||
chunk_id += 1;
|
||||
bytes.fetch_add(vec.len(), Ordering::SeqCst);
|
||||
}
|
||||
running.fetch_sub(1, Ordering::SeqCst);
|
||||
}))
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
|
||||
while running.load(Ordering::Acquire) > 0 {
|
||||
std::thread::sleep(std::time::Duration::from_secs(1));
|
||||
let bytes = bytes.swap(0, Ordering::Acquire);
|
||||
let used_size = engine.used_size();
|
||||
tracing::info!(
|
||||
"throughput: {:?}/s, allocated: {:?}, reserved: {:?}",
|
||||
Size::from(bytes),
|
||||
used_size.allocated_size,
|
||||
used_size.reserved_size,
|
||||
);
|
||||
}
|
||||
|
||||
for thread in threads {
|
||||
thread.join().unwrap();
|
||||
}
|
||||
|
||||
engine.stop_and_join();
|
||||
engine.speed_up_quit();
|
||||
Ok(())
|
||||
}
|
||||
1655
src/storage/chunk_engine/src/core/engine.rs
Normal file
1655
src/storage/chunk_engine/src/core/engine.rs
Normal file
File diff suppressed because it is too large
Load Diff
3
src/storage/chunk_engine/src/core/mod.rs
Normal file
3
src/storage/chunk_engine/src/core/mod.rs
Normal file
@@ -0,0 +1,3 @@
|
||||
mod engine;
|
||||
|
||||
pub use engine::*;
|
||||
598
src/storage/chunk_engine/src/cxx.rs
Normal file
598
src/storage/chunk_engine/src/cxx.rs
Normal file
@@ -0,0 +1,598 @@
|
||||
use std::collections::BTreeSet;
|
||||
use std::path::PathBuf;
|
||||
use std::pin::Pin;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::*;
|
||||
pub use ::cxx::CxxString;
|
||||
|
||||
fn create(path: &str, create: bool, prefix_len: usize, error: Pin<&mut CxxString>) -> Box<Engine> {
|
||||
let config = EngineConfig {
|
||||
path: PathBuf::from(path),
|
||||
create,
|
||||
prefix_len,
|
||||
};
|
||||
match Engine::open(&config) {
|
||||
Ok(engine) => Box::new(engine),
|
||||
Err(e) => {
|
||||
error.push_str(&e.to_string());
|
||||
unsafe { Box::from_raw(std::ptr::null_mut()) }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
struct LogGuard(tracing_appender::non_blocking::WorkerGuard);
|
||||
|
||||
fn init_log(path: &str, error: Pin<&mut CxxString>) -> Box<LogGuard> {
|
||||
match rolling_file::BasicRollingFileAppender::new(
|
||||
path,
|
||||
rolling_file::RollingConditionBasic::new().max_size(Size::mebibyte(500).into()),
|
||||
20,
|
||||
) {
|
||||
Ok(file_appender) => {
|
||||
let (non_blocking, guard) = tracing_appender::non_blocking(file_appender);
|
||||
tracing_subscriber::fmt()
|
||||
.with_max_level(tracing::Level::INFO)
|
||||
.with_writer(non_blocking)
|
||||
.with_ansi(false)
|
||||
.init();
|
||||
Box::new(LogGuard(guard))
|
||||
}
|
||||
Err(e) => {
|
||||
error.push_str(&e.to_string());
|
||||
unsafe { Box::from_raw(std::ptr::null_mut()) }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Chunk {
|
||||
fn raw_meta(&self) -> &ffi::RawMeta {
|
||||
unsafe { std::mem::transmute(self.meta()) }
|
||||
}
|
||||
|
||||
fn raw_etag(&self) -> &[u8] {
|
||||
&self.meta().etag
|
||||
}
|
||||
|
||||
fn uncommitted(&self) -> bool {
|
||||
self.meta().uncommitted
|
||||
}
|
||||
}
|
||||
|
||||
impl WritingChunk {
|
||||
fn raw_meta(&self) -> &ffi::RawMeta {
|
||||
self.chunk.raw_meta()
|
||||
}
|
||||
|
||||
fn raw_etag(&self) -> &[u8] {
|
||||
self.chunk.raw_etag()
|
||||
}
|
||||
|
||||
fn uncommitted(&self) -> bool {
|
||||
self.chunk.uncommitted()
|
||||
}
|
||||
|
||||
fn raw_chunk(&self) -> *const Chunk {
|
||||
&self.chunk
|
||||
}
|
||||
|
||||
fn set_chain_ver(&mut self, chain_ver: u32) {
|
||||
self.chunk.set_chain_ver(chain_ver);
|
||||
}
|
||||
}
|
||||
|
||||
impl Engine {
|
||||
/// Returns the engine's used-size statistics reinterpreted as the bridge's `RawUsedSize`.
/// Equal size and alignment of the two types are asserted at the bottom of this file.
fn raw_used_size(&self) -> ffi::RawUsedSize {
    let used = self.used_size();
    unsafe { std::mem::transmute(used) }
}
|
||||
|
||||
fn get_raw_chunk(&self, chunk_id: &[u8], error: Pin<&mut CxxString>) -> *const Chunk {
|
||||
match self.get(chunk_id) {
|
||||
Ok(None) => {
|
||||
error.clear();
|
||||
std::ptr::null()
|
||||
}
|
||||
Ok(Some(c)) => {
|
||||
error.clear();
|
||||
Arc::into_raw(c)
|
||||
}
|
||||
Err(e) => {
|
||||
error.push_str(&e.to_string());
|
||||
std::ptr::null()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn get_raw_chunks(&self, reqs: &mut [GetReq], error: Pin<&mut CxxString>) {
|
||||
let chunk_ids = reqs
|
||||
.iter()
|
||||
.map(|r| Bytes::from(r.chunk_id))
|
||||
.collect::<BTreeSet<_>>();
|
||||
match self.batch_get(&chunk_ids) {
|
||||
Ok(chunks) => {
|
||||
for req in reqs {
|
||||
match chunks.get(req.chunk_id) {
|
||||
Some(c) => req.chunk_ptr = Arc::into_raw(c.clone()),
|
||||
None => req.chunk_ptr = std::ptr::null_mut(),
|
||||
}
|
||||
}
|
||||
error.clear();
|
||||
}
|
||||
Err(e) => {
|
||||
error.push_str(&e.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsafe fn release_raw_chunk(&self, chunk: *const Chunk) {
|
||||
if !chunk.is_null() {
|
||||
Arc::from_raw(chunk);
|
||||
}
|
||||
}
|
||||
|
||||
/// Destroys a `WritingChunk` previously leaked with `Box::into_raw`
/// (see `update_raw_chunk`). Null is ignored.
///
/// # Safety
/// A non-null `chunk` must come from `Box::into_raw` and must not be used afterwards.
unsafe fn release_writing_chunk(&self, chunk: *mut WritingChunk) {
    if chunk.is_null() {
        return;
    }
    drop(Box::from_raw(chunk));
}
|
||||
|
||||
fn update_raw_chunk(
|
||||
&self,
|
||||
chunk_id: &[u8],
|
||||
mut req: Pin<&mut ffi::UpdateReq>,
|
||||
error: Pin<&mut CxxString>,
|
||||
) -> *mut WritingChunk {
|
||||
match self.update_chunk(chunk_id, &mut req) {
|
||||
Ok(chunk) => Box::into_raw(Box::new(chunk)),
|
||||
Err(e) => {
|
||||
error.push_str(&e.to_string());
|
||||
req.out_error_code = match e {
|
||||
Error::IoError(_) => 4011, // ChunkWriteFailed
|
||||
Error::RocksDBError(_) => 4003, // ChunkMetadataSetError
|
||||
Error::MetaError(_) => 4002, // ChunkMetadataGetError
|
||||
Error::InvalidArg(_) => 3, // InvalidArg
|
||||
Error::SerializationError(_) => 4002, // ChunkMetadataGetError
|
||||
Error::ChecksumMismatch(_) => 4080, // ChecksumMismatch
|
||||
Error::ChainVersionMismatch(_) => 4081, // ChainVersionMismatch
|
||||
Error::ChunkETagMismatch(_) => 4083, // ChunkETagMismatch
|
||||
Error::ChunkAlreadyExists => 4084, // ChunkAlreadyExists
|
||||
Error::ChunkCommittedUpdate(_) => 4008, // ChunkCommittedUpdate
|
||||
Error::ChunkMissingUpdate(_) => 4007, // ChunkMissingUpdate
|
||||
Error::NoSpace => 7021, // NoSpace
|
||||
};
|
||||
std::ptr::null_mut()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsafe fn commit_raw_chunk(
|
||||
&self,
|
||||
new_chunk: *mut WritingChunk,
|
||||
sync: bool,
|
||||
error: Pin<&mut CxxString>,
|
||||
) {
|
||||
let new_chunk = Box::from_raw(new_chunk);
|
||||
match self.commit_chunk(*new_chunk, sync) {
|
||||
Ok(_) => (),
|
||||
Err(e) => error.push_str(&e.to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
unsafe fn commit_raw_chunks(
|
||||
&self,
|
||||
reqs: &[*mut WritingChunk],
|
||||
sync: bool,
|
||||
error: Pin<&mut CxxString>,
|
||||
) {
|
||||
let chunks = reqs.iter().map(|c| *Box::from_raw(*c)).collect::<Vec<_>>();
|
||||
match self.commit_chunks(chunks, sync) {
|
||||
Ok(_) => (),
|
||||
Err(e) => error.push_str(&e.to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
fn query_raw_chunks(
|
||||
&self,
|
||||
begin: &[u8],
|
||||
end: &[u8],
|
||||
max_count: u64,
|
||||
error: Pin<&mut CxxString>,
|
||||
) -> Box<RawChunks> {
|
||||
match self.query_chunks(begin, end, max_count) {
|
||||
Ok(vec) => Box::new(RawChunks { vec }),
|
||||
Err(e) => {
|
||||
error.push_str(&e.to_string());
|
||||
Default::default()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn query_all_raw_chunks(&self, prefix: &[u8], error: Pin<&mut CxxString>) -> Box<RawChunks> {
|
||||
match self.query_all_chunks(prefix) {
|
||||
Ok(vec) => Box::new(RawChunks { vec }),
|
||||
Err(e) => {
|
||||
error.push_str(&e.to_string());
|
||||
Default::default()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn query_raw_chunks_by_timestamp(
|
||||
&self,
|
||||
prefix: &[u8],
|
||||
begin: u64,
|
||||
end: u64,
|
||||
max_count: u64,
|
||||
error: Pin<&mut CxxString>,
|
||||
) -> Box<RawChunks> {
|
||||
match self.query_chunks_by_timestamp(prefix, begin, end, max_count) {
|
||||
Ok(vec) => Box::new(RawChunks { vec }),
|
||||
Err(e) => {
|
||||
error.push_str(&e.to_string());
|
||||
Default::default()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn raw_batch_remove(
|
||||
&self,
|
||||
begin: &[u8],
|
||||
end: &[u8],
|
||||
max_count: u64,
|
||||
error: Pin<&mut CxxString>,
|
||||
) -> u64 {
|
||||
match self.batch_remove(begin, end, max_count) {
|
||||
Ok(cnt) => cnt,
|
||||
Err(e) => {
|
||||
error.push_str(&e.to_string());
|
||||
0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Asks the meta store for the used size recorded under `prefix`.
/// On failure the message is appended to `error` and 0 is returned.
fn query_raw_used_size(&self, prefix: &[u8], error: Pin<&mut CxxString>) -> u64 {
    let outcome = self.meta_store.query_used_size(prefix);
    if let Err(e) = &outcome {
        error.push_str(&e.to_string());
    }
    outcome.unwrap_or(0)
}
|
||||
|
||||
fn get_metrics(&self) -> ffi::Metrics {
|
||||
let metrics = self.metrics.as_ref();
|
||||
let copy_on_write_times = metrics.copy_on_write_times.swap(0, Ordering::AcqRel);
|
||||
let copy_on_write_latency = metrics.copy_on_write_latency.swap(0, Ordering::AcqRel);
|
||||
let copy_on_write_read_times = metrics.copy_on_write_read_times.swap(0, Ordering::AcqRel);
|
||||
let copy_on_write_read_latency =
|
||||
metrics.copy_on_write_read_latency.swap(0, Ordering::AcqRel);
|
||||
let allocate_total_latency = metrics.allocate_latency.swap(0, Ordering::AcqRel);
|
||||
let allocate_total_times = metrics.allocate_times.swap(0, Ordering::AcqRel);
|
||||
let pwrite_total_latency = metrics.pwrite_latency.swap(0, Ordering::AcqRel);
|
||||
let pwrite_total_times = metrics.pwrite_times.swap(0, Ordering::AcqRel);
|
||||
ffi::Metrics {
|
||||
copy_on_write_times,
|
||||
copy_on_write_latency: copy_on_write_latency / std::cmp::max(1, copy_on_write_times),
|
||||
copy_on_write_read_bytes: metrics.copy_on_write_read_bytes.swap(0, Ordering::AcqRel),
|
||||
copy_on_write_read_times,
|
||||
copy_on_write_read_latency: copy_on_write_read_latency
|
||||
/ std::cmp::max(1, copy_on_write_read_times),
|
||||
checksum_reuse: metrics.checksum_reuse.swap(0, Ordering::AcqRel),
|
||||
checksum_combine: metrics.checksum_combine.swap(0, Ordering::AcqRel),
|
||||
checksum_recalculate: metrics.checksum_recalculate.swap(0, Ordering::AcqRel),
|
||||
safe_write_direct_append: metrics.safe_write_direct_append.swap(0, Ordering::AcqRel),
|
||||
safe_write_indirect_append: metrics
|
||||
.safe_write_indirect_append
|
||||
.swap(0, Ordering::AcqRel),
|
||||
safe_write_truncate_shorten: metrics
|
||||
.safe_write_truncate_shorten
|
||||
.swap(0, Ordering::AcqRel),
|
||||
safe_write_truncate_extend: metrics
|
||||
.safe_write_truncate_extend
|
||||
.swap(0, Ordering::AcqRel),
|
||||
safe_write_read_tail_times: metrics
|
||||
.safe_write_read_tail_times
|
||||
.swap(0, Ordering::AcqRel),
|
||||
safe_write_read_tail_bytes: metrics
|
||||
.safe_write_read_tail_bytes
|
||||
.swap(0, Ordering::AcqRel),
|
||||
allocate_latency: allocate_total_latency / std::cmp::max(1, allocate_total_times),
|
||||
allocate_times: allocate_total_times,
|
||||
pwrite_latency: pwrite_total_latency / std::cmp::max(1, pwrite_total_times),
|
||||
pwrite_times: pwrite_total_times,
|
||||
}
|
||||
}
|
||||
|
||||
fn query_uncommitted_raw_chunks(
|
||||
&self,
|
||||
prefix: &[u8],
|
||||
error: Pin<&mut CxxString>,
|
||||
) -> Box<RawChunks> {
|
||||
match self.query_uncommitted_chunks(prefix) {
|
||||
Ok(chunks) => Box::new(RawChunks { vec: chunks }),
|
||||
Err(e) => {
|
||||
error.push_str(&e.to_string());
|
||||
Default::default()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_uncommitted_raw_chunks(
|
||||
&self,
|
||||
prefix: &[u8],
|
||||
chain_ver: u32,
|
||||
error: Pin<&mut CxxString>,
|
||||
) -> Box<RawChunks> {
|
||||
match self.handle_uncommitted_chunks(prefix, chain_ver) {
|
||||
Ok(chunks) => Box::new(RawChunks { vec: chunks }),
|
||||
Err(e) => {
|
||||
error.push_str(&e.to_string());
|
||||
Box::default()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct RawChunks {
|
||||
vec: Vec<(Bytes, ChunkMeta)>,
|
||||
}
|
||||
|
||||
impl RawChunks {
|
||||
fn len(&self) -> usize {
|
||||
self.vec.len()
|
||||
}
|
||||
|
||||
fn chunk_id(&self, pos: usize) -> &[u8] {
|
||||
self.vec[pos].0.as_ref()
|
||||
}
|
||||
|
||||
fn chunk_meta(&self, pos: usize) -> &ffi::RawMeta {
|
||||
unsafe { std::mem::transmute(&self.vec[pos].1) }
|
||||
}
|
||||
|
||||
fn chunk_etag(&self, pos: usize) -> &[u8] {
|
||||
&self.vec[pos].1.etag
|
||||
}
|
||||
|
||||
fn chunk_uncommitted(&self, pos: usize) -> bool {
|
||||
self.vec[pos].1.uncommitted
|
||||
}
|
||||
}
|
||||
|
||||
#[::cxx::bridge(namespace = "hf3fs::chunk_engine")]
|
||||
pub mod ffi {
|
||||
#[derive(Default, Clone, Copy, PartialEq, Eq, Debug)]
|
||||
struct UpdateReq {
|
||||
without_checksum: bool,
|
||||
is_truncate: bool,
|
||||
is_remove: bool,
|
||||
is_syncing: bool,
|
||||
update_ver: u32,
|
||||
chain_ver: u32,
|
||||
checksum: u32,
|
||||
length: u32,
|
||||
offset: u32,
|
||||
data: u64,
|
||||
last_request_id: u64,
|
||||
last_client_low: u64,
|
||||
last_client_high: u64,
|
||||
expected_tag: &'static [u8],
|
||||
desired_tag: &'static [u8],
|
||||
create_new: bool,
|
||||
|
||||
out_non_existent: bool,
|
||||
out_error_code: u16,
|
||||
out_commit_ver: u32,
|
||||
out_chain_ver: u32,
|
||||
out_checksum: u32,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
|
||||
struct GetReq<'a> {
|
||||
chunk_id: &'a [u8],
|
||||
chunk_ptr: *const Chunk,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Copy, PartialEq, Eq, Debug)]
|
||||
struct RawMeta {
|
||||
pos: u64,
|
||||
chain_ver: u32,
|
||||
chunk_ver: u32,
|
||||
len: u32,
|
||||
checksum: u32,
|
||||
timestamp: u64,
|
||||
last_request_id: u64,
|
||||
last_client_low: u64,
|
||||
last_client_high: u64,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Copy, PartialEq, Eq, Debug)]
|
||||
struct RawUsedSize {
|
||||
allocated_size: u64,
|
||||
reserved_size: u64,
|
||||
position_count: u64,
|
||||
position_rc: u64,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Copy, PartialEq, Eq, Debug)]
|
||||
struct FdAndOffset {
|
||||
fd: i32,
|
||||
offset: u64,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Copy, PartialEq, Eq, Debug)]
|
||||
pub struct Metrics {
|
||||
pub copy_on_write_times: u64,
|
||||
pub copy_on_write_latency: u64,
|
||||
pub copy_on_write_read_bytes: u64,
|
||||
pub copy_on_write_read_times: u64,
|
||||
pub copy_on_write_read_latency: u64,
|
||||
|
||||
pub checksum_reuse: u64,
|
||||
pub checksum_combine: u64,
|
||||
pub checksum_recalculate: u64,
|
||||
|
||||
pub safe_write_direct_append: u64,
|
||||
pub safe_write_indirect_append: u64,
|
||||
pub safe_write_truncate_shorten: u64,
|
||||
pub safe_write_truncate_extend: u64,
|
||||
pub safe_write_read_tail_times: u64,
|
||||
pub safe_write_read_tail_bytes: u64,
|
||||
|
||||
pub allocate_times: u64,
|
||||
pub allocate_latency: u64,
|
||||
pub pwrite_times: u64,
|
||||
pub pwrite_latency: u64,
|
||||
}
|
||||
|
||||
extern "Rust" {
|
||||
type Engine;
|
||||
fn create(
|
||||
path: &str,
|
||||
create: bool,
|
||||
prefix_len: usize,
|
||||
error: Pin<&mut CxxString>,
|
||||
) -> Box<Engine>;
|
||||
|
||||
fn raw_used_size(&self) -> RawUsedSize;
|
||||
fn allocate_groups(&self, min_remain: usize, max_remain: usize, batch_size: usize)
|
||||
-> usize;
|
||||
fn allocate_ultra_groups(
|
||||
&self,
|
||||
min_remain: usize,
|
||||
max_remain: usize,
|
||||
batch_size: usize,
|
||||
) -> usize;
|
||||
fn compact_groups(&self, max_reserved: u64) -> usize;
|
||||
|
||||
fn set_allow_to_allocate(&self, val: bool);
|
||||
fn speed_up_quit(&self);
|
||||
|
||||
fn get_raw_chunk(&self, chunk_id: &[u8], error: Pin<&mut CxxString>) -> *const Chunk;
|
||||
fn get_raw_chunks(&self, reqs: &mut [GetReq], error: Pin<&mut CxxString>);
|
||||
unsafe fn release_raw_chunk(&self, chunk: *const Chunk);
|
||||
unsafe fn release_writing_chunk(&self, chunk: *mut WritingChunk);
|
||||
|
||||
fn update_raw_chunk(
|
||||
&self,
|
||||
chunk_id: &[u8],
|
||||
req: Pin<&mut UpdateReq>,
|
||||
error: Pin<&mut CxxString>,
|
||||
) -> *mut WritingChunk;
|
||||
|
||||
unsafe fn commit_raw_chunk(
|
||||
&self,
|
||||
new_chunk: *mut WritingChunk,
|
||||
sync: bool,
|
||||
error: Pin<&mut CxxString>,
|
||||
);
|
||||
|
||||
unsafe fn commit_raw_chunks(
|
||||
&self,
|
||||
reqs: &[*mut WritingChunk],
|
||||
sync: bool,
|
||||
error: Pin<&mut CxxString>,
|
||||
);
|
||||
|
||||
fn query_raw_chunks(
|
||||
&self,
|
||||
begin: &[u8],
|
||||
end: &[u8],
|
||||
max_count: u64,
|
||||
error: Pin<&mut CxxString>,
|
||||
) -> Box<RawChunks>;
|
||||
|
||||
fn query_all_raw_chunks(&self, prefix: &[u8], error: Pin<&mut CxxString>)
|
||||
-> Box<RawChunks>;
|
||||
|
||||
fn query_raw_chunks_by_timestamp(
|
||||
&self,
|
||||
prefix: &[u8],
|
||||
begin: u64,
|
||||
end: u64,
|
||||
max_count: u64,
|
||||
error: Pin<&mut CxxString>,
|
||||
) -> Box<RawChunks>;
|
||||
|
||||
fn raw_batch_remove(
|
||||
&self,
|
||||
begin: &[u8],
|
||||
end: &[u8],
|
||||
max_count: u64,
|
||||
error: Pin<&mut CxxString>,
|
||||
) -> u64;
|
||||
|
||||
fn query_raw_used_size(&self, prefix: &[u8], error: Pin<&mut CxxString>) -> u64;
|
||||
|
||||
fn get_metrics(&self) -> Metrics;
|
||||
|
||||
fn query_uncommitted_raw_chunks(
|
||||
&self,
|
||||
prefix: &[u8],
|
||||
error: Pin<&mut CxxString>,
|
||||
) -> Box<RawChunks>;
|
||||
|
||||
fn handle_uncommitted_raw_chunks(
|
||||
&self,
|
||||
prefix: &[u8],
|
||||
chain_ver: u32,
|
||||
error: Pin<&mut CxxString>,
|
||||
) -> Box<RawChunks>;
|
||||
}
|
||||
|
||||
extern "Rust" {
|
||||
type LogGuard;
|
||||
fn init_log(path: &str, error: Pin<&mut CxxString>) -> Box<LogGuard>;
|
||||
}
|
||||
|
||||
extern "Rust" {
|
||||
type Chunk;
|
||||
fn raw_meta(&self) -> &RawMeta;
|
||||
fn raw_etag(&self) -> &[u8];
|
||||
fn uncommitted(&self) -> bool;
|
||||
fn fd_and_offset(&self) -> FdAndOffset;
|
||||
}
|
||||
|
||||
extern "Rust" {
|
||||
type WritingChunk;
|
||||
fn raw_meta(&self) -> &RawMeta;
|
||||
fn raw_etag(&self) -> &[u8];
|
||||
fn uncommitted(&self) -> bool;
|
||||
fn raw_chunk(&self) -> *const Chunk;
|
||||
fn set_chain_ver(&mut self, chain_ver: u32);
|
||||
}
|
||||
|
||||
extern "Rust" {
|
||||
type RawChunks;
|
||||
fn len(&self) -> usize;
|
||||
fn chunk_id(&self, pos: usize) -> &[u8];
|
||||
fn chunk_meta(&self, pos: usize) -> &RawMeta;
|
||||
fn chunk_etag(&self, pos: usize) -> &[u8];
|
||||
fn chunk_uncommitted(&self, pos: usize) -> bool;
|
||||
}
|
||||
}
|
||||
|
||||
static_assertions::const_assert_eq!(
|
||||
std::mem::align_of::<ChunkMeta>(),
|
||||
std::mem::align_of::<ffi::RawMeta>()
|
||||
);
|
||||
static_assertions::const_assert_eq!(
|
||||
std::mem::size_of::<UsedSize>(),
|
||||
std::mem::size_of::<ffi::RawUsedSize>()
|
||||
);
|
||||
static_assertions::const_assert_eq!(
|
||||
std::mem::align_of::<UsedSize>(),
|
||||
std::mem::align_of::<ffi::RawUsedSize>()
|
||||
);
|
||||
static_assertions::const_assert_eq!(
|
||||
std::mem::size_of::<Metrics>(),
|
||||
std::mem::size_of::<ffi::Metrics>()
|
||||
);
|
||||
static_assertions::const_assert_eq!(
|
||||
std::mem::align_of::<Metrics>(),
|
||||
std::mem::align_of::<ffi::Metrics>()
|
||||
);
|
||||
176
src/storage/chunk_engine/src/file/cluster.rs
Normal file
176
src/storage/chunk_engine/src/file/cluster.rs
Normal file
@@ -0,0 +1,176 @@
|
||||
use std::fs::File;
|
||||
use std::os::fd::AsRawFd;
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::Path;
|
||||
use std::{fs::OpenOptions, os::unix::fs::OpenOptionsExt};
|
||||
|
||||
use super::super::*;
|
||||
|
||||
/// `fallocate` mode used by `Cluster::fallocate` when `punch_hole` is set: punch a hole in
/// the range while keeping the file size unchanged.
const PUNCH_HOLE_FLAGS: i32 = libc::FALLOC_FL_KEEP_SIZE | libc::FALLOC_FL_PUNCH_HOLE;
|
||||
|
||||
/// One cluster file, opened twice so that each I/O can pick the matching handle.
pub struct Cluster {
    /// Handle opened with `O_SYNC`; used for I/O that is not suitably aligned.
    pub normal_fd: File,
    /// Handle opened with `O_DIRECT` when the filesystem supports it (otherwise `O_SYNC`);
    /// used for aligned I/O and for `fallocate`.
    pub direct_fd: File,
}
|
||||
|
||||
impl Cluster {
|
||||
pub fn open(path: &Path, create: bool, support_direct_io: bool) -> Result<Self> {
|
||||
let normal_fd = OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create(create)
|
||||
.custom_flags(libc::O_SYNC)
|
||||
.open(path)
|
||||
.map_err(|err| Error::IoError(format!("open {:?} failed: {:?}", path, err)))?;
|
||||
|
||||
let direct_fd = OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.custom_flags(if support_direct_io {
|
||||
libc::O_DIRECT
|
||||
} else {
|
||||
libc::O_SYNC
|
||||
})
|
||||
.open(path)
|
||||
.map_err(|err| Error::IoError(format!("open {:?} failed: {:?}", path, err)))?;
|
||||
|
||||
Ok(Self {
|
||||
normal_fd,
|
||||
direct_fd,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn fallocate(&self, group_id: GroupId, punch_hole: bool) -> Result<()> {
|
||||
let res = unsafe {
|
||||
libc::fallocate(
|
||||
self.direct_fd.as_raw_fd(),
|
||||
if punch_hole { PUNCH_HOLE_FLAGS } else { 0 },
|
||||
group_id.offset().into(),
|
||||
group_id.size().into(),
|
||||
)
|
||||
};
|
||||
if res == -1 {
|
||||
Err(Error::IoError(format!(
|
||||
"fallocate {} error: {:?}",
|
||||
self.direct_fd.as_raw_fd(),
|
||||
std::io::Error::last_os_error()
|
||||
)))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn pread(&self, pos: Position, mut buf: &mut [u8], offset: u32) -> Result<()> {
|
||||
let aligned = is_aligned_io(buf, offset);
|
||||
let mut offset = pos.offset() + offset;
|
||||
while !buf.is_empty() {
|
||||
let fd = if aligned && is_aligned_len(buf.len() as u32) {
|
||||
&self.direct_fd
|
||||
} else {
|
||||
&self.normal_fd
|
||||
};
|
||||
|
||||
match fd.read_at(buf, offset.into()) {
|
||||
Ok(0) => return Err(Error::IoError(format!("read {:?} return 0", fd))),
|
||||
Ok(n) => {
|
||||
buf = &mut buf[n..];
|
||||
offset += n;
|
||||
}
|
||||
Err(e) => Self::handle_error(e)?,
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn pwrite(&self, pos: Position, mut buf: &[u8], offset: u32) -> Result<()> {
|
||||
let aligned = is_aligned_io(buf, offset);
|
||||
let mut offset = pos.offset() + offset;
|
||||
while !buf.is_empty() {
|
||||
let fd = if aligned && is_aligned_len(buf.len() as u32) {
|
||||
&self.direct_fd
|
||||
} else {
|
||||
&self.normal_fd
|
||||
};
|
||||
|
||||
match fd.write_at(buf, offset.into()) {
|
||||
Ok(0) => return Err(Error::IoError(format!("write {:?} return 0", fd))),
|
||||
Ok(n) => {
|
||||
buf = &buf[n..];
|
||||
offset += n as u64;
|
||||
}
|
||||
Err(e) => Self::handle_error(e)?,
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn handle_error(e: std::io::Error) -> Result<()> {
|
||||
if e.kind() == std::io::ErrorKind::Interrupted {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(Error::IoError(format!("io error: {:?}", e)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::os::fd::FromRawFd;

    /// Covers open / fallocate / pread / pwrite on real temp files for every chunk size,
    /// then the failure paths using `/dev/null` and an invalid file descriptor.
    #[test]
    fn test_cluster_open() {
        let dir = tempfile::tempdir().unwrap();
        let support_direct_io = FsType::check(&dir).support_direct_io();

        for chunk_size in [CHUNK_SIZE_NORMAL, CHUNK_SIZE_SMALL, CHUNK_SIZE_LARGE] {
            let file_path = dir.path().join(format!("test.cluster.{}", chunk_size));

            // Opening without `create` fails while the file does not exist yet.
            assert!(Cluster::open(&file_path, false, support_direct_io).is_err());

            // Creation yields an empty file.
            let cluster = Cluster::open(&file_path, true, support_direct_io).unwrap();
            assert_eq!(cluster.normal_fd.metadata().unwrap().len(), 0);

            // Re-opening the existing file without `create` now works.
            let cluster = Cluster::open(&file_path, false, support_direct_io).unwrap();
            let group_id = GroupId::new(chunk_size, 0, 0);
            let pos = Position::new(group_id, 0);

            // Nothing to read from an empty file.
            let mut buf = [0u8; 5];
            assert!(cluster.pread(pos, &mut buf, 0).is_err());

            // Allocation extends the file to one full group.
            cluster.fallocate(group_id, false).unwrap();
            assert_eq!(cluster.normal_fd.metadata().unwrap().len(), group_id.size());

            // Round trip: write, then read back the first `buf.len()` bytes.
            let bytes = "hello world!".as_bytes();
            assert!(cluster.pwrite(pos, bytes, 0).is_ok());
            assert!(cluster.pread(pos, &mut buf, 0).is_ok());
            assert_eq!(&buf, &bytes[0..buf.len()]);

            // Punching a hole keeps the file size.
            cluster.fallocate(group_id, true).unwrap();
            assert_eq!(cluster.normal_fd.metadata().unwrap().len(), group_id.size());
        }

        assert!(Cluster::open(Path::new("/dev/null"), false, support_direct_io).is_err());

        // Handles opened read-only on /dev/null: fallocate and write must fail.
        let cluster = Cluster {
            normal_fd: File::open("/dev/null").unwrap(),
            direct_fd: File::open("/dev/null").unwrap(),
        };
        assert!(cluster.fallocate(GroupId::default(), false).is_err());
        assert!(cluster.fallocate(GroupId::default(), true).is_err());
        assert!(cluster.pwrite(Position::from(0), &[1], 0).is_err());

        // A descriptor number expected not to be open: read must fail.
        let cluster = Cluster {
            normal_fd: unsafe { File::from_raw_fd(23333) },
            direct_fd: unsafe { File::from_raw_fd(23333) },
        };
        let mut buf = [0u8; 32];
        assert!(cluster.pread(Position::from(0), &mut buf, 0).is_err());
        // Skip the destructors so the fake descriptor is never closed.
        std::mem::forget(cluster);

        assert!(Cluster::handle_error(std::io::Error::from_raw_os_error(libc::EINTR)).is_ok());
    }
}
|
||||
118
src/storage/chunk_engine/src/file/clusters.rs
Normal file
118
src/storage/chunk_engine/src/file/clusters.rs
Normal file
@@ -0,0 +1,118 @@
|
||||
use super::super::*;
|
||||
use std::{fmt::Debug, os::fd::AsRawFd, path::PathBuf};
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct ClustersConfig {
|
||||
pub path: PathBuf,
|
||||
pub chunk_size: Size,
|
||||
pub create: bool,
|
||||
}
|
||||
|
||||
pub struct Clusters {
|
||||
pub path: PathBuf,
|
||||
pub chunk_size: Size,
|
||||
files: Vec<Cluster>,
|
||||
}
|
||||
|
||||
impl Clusters {
|
||||
const COUNT: u32 = 256;
|
||||
|
||||
pub fn open(config: &ClustersConfig) -> Result<Self> {
|
||||
let mut files: Vec<Cluster> = vec![];
|
||||
|
||||
if config.create {
|
||||
std::fs::create_dir_all(&config.path)
|
||||
.map_err(|e| Error::IoError(format!("create dir {:?} fail: {e:?}", config.path)))?;
|
||||
}
|
||||
|
||||
let support_direct_io = FsType::check(&config.path).support_direct_io();
|
||||
for cluster_id in 0..Self::COUNT {
|
||||
let file_path = config.path.join(format!("{:02X}", cluster_id));
|
||||
files.push(Cluster::open(&file_path, config.create, support_direct_io)?);
|
||||
}
|
||||
|
||||
Ok(Clusters {
|
||||
path: config.path.clone(),
|
||||
chunk_size: config.chunk_size,
|
||||
files,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn allocate(&self, group_id: GroupId) -> Result<()> {
|
||||
self.files[group_id.cluster() as usize].fallocate(group_id, false)
|
||||
}
|
||||
|
||||
pub fn deallocate(&self, group_id: GroupId) -> Result<()> {
|
||||
self.files[group_id.cluster() as usize].fallocate(group_id, true)
|
||||
}
|
||||
|
||||
pub fn pread(&self, pos: Position, buf: &mut [u8], offset: u32) -> Result<()> {
|
||||
self.files[pos.cluster() as usize].pread(pos, buf, offset)
|
||||
}
|
||||
|
||||
pub fn pwrite(&self, pos: Position, buf: &[u8], offset: u32) -> Result<()> {
|
||||
self.files[pos.cluster() as usize].pwrite(pos, buf, offset)
|
||||
}
|
||||
|
||||
pub fn fd_and_offset(&self, pos: Position) -> FdAndOffset {
|
||||
FdAndOffset {
|
||||
fd: self.files[pos.cluster() as usize].direct_fd.as_raw_fd(),
|
||||
offset: pos.offset().into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for Clusters {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("Clusters")
|
||||
.field("path", &self.path)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks how allocate / deallocate affect the size of cluster file 0, and that opening
    /// under an uncreatable directory fails.
    #[test]
    fn test_clusters() {
        let dir = tempfile::tempdir().unwrap();
        let clusters = Clusters::open(&ClustersConfig {
            path: dir.path().into(),
            chunk_size: CHUNK_SIZE_NORMAL,
            create: true,
        })
        .unwrap();

        let cluster = &clusters.files[0];
        let file_len = || cluster.normal_fd.metadata().unwrap().len();

        let group_id = GroupId::new(CHUNK_SIZE_NORMAL, 0, 0);
        assert_eq!(file_len(), 0);

        // The first group extends the file to one group.
        clusters.allocate(group_id).unwrap();
        assert_eq!(file_len(), group_id.size());

        // The group at index 3 extends it to four groups.
        let group_id_3 = GroupId::new(CHUNK_SIZE_NORMAL, 0, 3);
        clusters.allocate(group_id_3).unwrap();
        assert_eq!(file_len(), group_id.size() * 4);

        // Deallocation punches holes and never shrinks the file.
        clusters.deallocate(group_id).unwrap();
        assert_eq!(file_len(), group_id.size() * 4);

        clusters.deallocate(group_id_3).unwrap();
        assert_eq!(file_len(), group_id.size() * 4);

        // A directory under /proc cannot be created.
        let config = ClustersConfig {
            path: std::path::Path::new("/proc/test").into(),
            chunk_size: CHUNK_SIZE_NORMAL,
            create: true,
        };
        assert!(Clusters::open(&config).is_err());
    }
}
|
||||
33
src/storage/chunk_engine/src/file/fs_type.rs
Normal file
33
src/storage/chunk_engine/src/file/fs_type.rs
Normal file
@@ -0,0 +1,33 @@
|
||||
use std::{ffi::CString, os::unix::ffi::OsStrExt, path::Path};
|
||||
|
||||
/// Filesystem kinds distinguished by `FsType::check`.
#[derive(Debug, PartialEq, Clone, Copy)]
pub enum FsType {
    EXT4,
    NFS,
    XFS,
    /// The only kind reported as not supporting direct I/O.
    ZFS,
    /// Any other filesystem, or a failed lookup.
    OTHER,
}
|
||||
|
||||
impl FsType {
|
||||
pub fn check(path: impl AsRef<Path>) -> Self {
|
||||
let path_cstr = CString::new(path.as_ref().as_os_str().as_bytes()).unwrap();
|
||||
let mut stat: libc::statfs = unsafe { std::mem::zeroed() };
|
||||
let result = unsafe { libc::statfs(path_cstr.as_ptr(), &mut stat) };
|
||||
if result != 0 {
|
||||
Self::OTHER
|
||||
} else {
|
||||
match stat.f_type {
|
||||
libc::EXT4_SUPER_MAGIC => Self::EXT4,
|
||||
libc::NFS_SUPER_MAGIC => Self::NFS,
|
||||
libc::XFS_SUPER_MAGIC => Self::XFS,
|
||||
0x2FC12FC1 => Self::ZFS, // https://github.com/openzfs/zfs/blob/33174af15112ed5c53299da2d28e763b0163f428/include/sys/fs/zfs.h#L1339
|
||||
_ => Self::OTHER,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn support_direct_io(&self) -> bool {
|
||||
!matches!(self, FsType::ZFS)
|
||||
}
|
||||
}
|
||||
7
src/storage/chunk_engine/src/file/mod.rs
Normal file
7
src/storage/chunk_engine/src/file/mod.rs
Normal file
@@ -0,0 +1,7 @@
|
||||
mod cluster;
|
||||
mod clusters;
|
||||
mod fs_type;
|
||||
|
||||
pub use cluster::*;
|
||||
pub use clusters::*;
|
||||
pub use fs_type::*;
|
||||
18
src/storage/chunk_engine/src/lib.rs
Normal file
18
src/storage/chunk_engine/src/lib.rs
Normal file
@@ -0,0 +1,18 @@
|
||||
mod alloc;
|
||||
mod core;
|
||||
mod cxx;
|
||||
mod file;
|
||||
mod meta;
|
||||
mod types;
|
||||
mod utils;
|
||||
|
||||
pub use alloc::*;
|
||||
pub use core::*;
|
||||
pub use cxx::{
|
||||
ffi::{FdAndOffset, GetReq, UpdateReq},
|
||||
CxxString,
|
||||
};
|
||||
pub use file::*;
|
||||
pub use meta::*;
|
||||
pub use types::*;
|
||||
pub use utils::*;
|
||||
217
src/storage/chunk_engine/src/meta/meta_key.rs
Normal file
217
src/storage/chunk_engine/src/meta/meta_key.rs
Normal file
@@ -0,0 +1,217 @@
|
||||
use super::super::{Bytes, Error, GroupId, Position, Result};
|
||||
use byteorder::{BigEndian, ByteOrder};
|
||||
|
||||
/// A metadata-store key: one marker byte identifying the key family, followed by a
/// family-specific payload.
pub struct MetaKey(Bytes);
|
||||
|
||||
impl MetaKey {
|
||||
// Marker byte that starts every key of the corresponding family.
pub const CHUNK_META_KEY_PREFIX: u8 = 1;
pub const GROUP_BITS_KEY_PREFIX: u8 = 2;
pub const POS_TO_CHUNK_KEY_PREFIX: u8 = 3;
pub const USED_SIZE_KEY_PREFIX: u8 = 4;
pub const USED_SIZE_PREFIX_LEN_KEY: u8 = 5;
pub const TIMESTAMP_KEY_PREFIX: u8 = 6;
// NOTE(review): 7 looks like a retired marker for writing chunks (now 9) — confirm it must
// stay unused.
// pub const WRITING_CHUNK_KEY_PREFIX: u8 = 7;
pub const VERSION_KEY: u8 = 8;
pub const WRITING_CHUNK_KEY_PREFIX: u8 = 9;
pub const TEST_KEY_PREFIX: u8 = b'm';
|
||||
|
||||
fn prefix(mark: u8) -> Self {
|
||||
let mut vec = Bytes::new();
|
||||
vec.push(mark);
|
||||
Self(vec)
|
||||
}
|
||||
|
||||
pub fn chunk_meta_key_prefix() -> Self {
|
||||
Self::prefix(Self::CHUNK_META_KEY_PREFIX)
|
||||
}
|
||||
|
||||
pub fn chunk_meta_key(chunk_id: &[u8]) -> Self {
|
||||
let mut out = Self::chunk_meta_key_prefix();
|
||||
for num in chunk_id {
|
||||
out.0.push(!num)
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
pub fn parse_chunk_meta_key(key: &[u8]) -> Bytes {
|
||||
let mut out = Bytes::new();
|
||||
for num in &key[1..] {
|
||||
out.push(!num);
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
pub fn group_bits_key_prefix() -> Self {
|
||||
Self::prefix(Self::GROUP_BITS_KEY_PREFIX)
|
||||
}
|
||||
|
||||
pub fn group_bits_chunk_size_prefix(group_id: GroupId) -> Self {
|
||||
let mut out = Self::group_bits_key_prefix();
|
||||
out.0.extend_from_slice(&group_id.to_be_bytes()[..4]);
|
||||
out
|
||||
}
|
||||
|
||||
pub fn group_bits_key(group_id: GroupId) -> Self {
|
||||
let mut out = Self::group_bits_key_prefix();
|
||||
out.0.extend_from_slice(&group_id.to_be_bytes());
|
||||
out
|
||||
}
|
||||
|
||||
pub fn parse_group_bits_key(key: &[u8]) -> Result<GroupId> {
|
||||
if key.len() == std::mem::size_of::<u8>() + std::mem::size_of::<u64>() {
|
||||
let group_id = BigEndian::read_u64(&key[1..]);
|
||||
Ok(GroupId::from(group_id))
|
||||
} else {
|
||||
Err(Error::MetaError(format!(
|
||||
"parse group bits key fail: {:?}",
|
||||
key
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn pos_to_chunk_key_prefix() -> Self {
|
||||
Self::prefix(Self::POS_TO_CHUNK_KEY_PREFIX)
|
||||
}
|
||||
|
||||
pub fn group_to_chunks_key_prefix(group_id: GroupId) -> Self {
|
||||
let mut out = Self::pos_to_chunk_key_prefix();
|
||||
out.0
|
||||
.extend_from_slice(&Position::new(group_id, 0).to_be_bytes());
|
||||
out.0.pop();
|
||||
out
|
||||
}
|
||||
|
||||
pub fn pos_to_chunk_key(pos: Position) -> Self {
|
||||
let mut out = Self::pos_to_chunk_key_prefix();
|
||||
out.0.extend_from_slice(&pos.to_be_bytes());
|
||||
out
|
||||
}
|
||||
|
||||
pub fn parse_pos_to_chunk_key(key: &[u8]) -> Result<Position> {
|
||||
if key.len() == std::mem::size_of::<u8>() + std::mem::size_of::<u64>() {
|
||||
Ok(Position::from(BigEndian::read_u64(&key[1..])))
|
||||
} else {
|
||||
Err(Error::MetaError(format!(
|
||||
"parse pos to chunk key fail: {:?}",
|
||||
key
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn used_size_key_prefix() -> Self {
|
||||
Self::prefix(Self::USED_SIZE_KEY_PREFIX)
|
||||
}
|
||||
|
||||
pub fn used_size_key(prefix: &[u8]) -> Self {
|
||||
let mut out = Self::used_size_key_prefix();
|
||||
out.0.extend_from_slice(prefix);
|
||||
out
|
||||
}
|
||||
|
||||
pub fn used_size_prefix_len_key() -> Self {
|
||||
Self::prefix(Self::USED_SIZE_PREFIX_LEN_KEY)
|
||||
}
|
||||
|
||||
pub fn timestamp_key_prefix() -> Self {
|
||||
Self::prefix(Self::TIMESTAMP_KEY_PREFIX)
|
||||
}
|
||||
|
||||
pub fn timestamp_key_filter(prefix: &[u8], timestamp: u64) -> Self {
|
||||
let mut out = Self::timestamp_key_prefix();
|
||||
out.0.extend_from_slice(prefix);
|
||||
out.0.extend_from_slice(×tamp.to_be_bytes());
|
||||
out
|
||||
}
|
||||
|
||||
pub fn timestamp_key(timestamp: u64, chunk_id: &[u8], prefix_len: usize) -> Self {
|
||||
let mut out = Self::timestamp_key_filter(&chunk_id[..prefix_len], timestamp);
|
||||
out.0.extend_from_slice(&chunk_id[prefix_len..]);
|
||||
out
|
||||
}
|
||||
|
||||
pub fn parse_timestamp_key(key: &[u8], prefix_len: usize) -> Result<(u64, Bytes)> {
|
||||
const L: usize = std::mem::size_of::<u8>() + std::mem::size_of::<u64>();
|
||||
if key.len() > L + prefix_len {
|
||||
let mut chunk_id = Bytes::from(&key[1..1 + prefix_len]);
|
||||
let timestamp = BigEndian::read_u64(&key[1 + prefix_len..]);
|
||||
chunk_id.extend_from_slice(&key[L + prefix_len..]);
|
||||
Ok((timestamp, chunk_id))
|
||||
} else {
|
||||
Err(Error::MetaError(format!(
|
||||
"parse timestamp key fail: {:?}",
|
||||
key
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn version_key() -> Self {
|
||||
Self::prefix(Self::VERSION_KEY)
|
||||
}
|
||||
|
||||
pub fn writing_chunk_key_prefix() -> Self {
|
||||
Self::prefix(Self::WRITING_CHUNK_KEY_PREFIX)
|
||||
}
|
||||
|
||||
pub fn writing_chunk_key(chunk_id: &[u8]) -> Self {
|
||||
let mut out = Self::writing_chunk_key_prefix();
|
||||
out.0.extend_from_slice(chunk_id);
|
||||
out
|
||||
}
|
||||
|
||||
pub fn parse_writing_chunk_key(key: &[u8]) -> Result<Bytes> {
|
||||
if key.len() > 1 {
|
||||
Ok(Bytes::from(&key[1..]))
|
||||
} else {
|
||||
Err(Error::MetaError(format!(
|
||||
"parse writing chunk key fail: {:?}",
|
||||
key
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for MetaKey {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    /// Round-trips the key constructors through their parsers and checks the
    /// rejection paths of the parsers.
    #[test]
    fn test_meta_key_create() {
        use super::super::super::*;

        let prefix = MetaKey::chunk_meta_key_prefix();
        assert_eq!(prefix.as_ref(), [MetaKey::CHUNK_META_KEY_PREFIX]);

        // Chunk-id bytes are stored complemented.
        let meta_key = MetaKey::chunk_meta_key(&[1, 2, 3, 4]);
        assert_eq!(
            meta_key.as_ref(),
            [MetaKey::CHUNK_META_KEY_PREFIX, !1, !2, !3, !4]
        );

        let group_id = GroupId::new(CHUNK_SIZE_NORMAL, 1, 2);
        let pos = Position::new(group_id, 3);
        let pos_to_chunk_key = MetaKey::pos_to_chunk_key(pos);
        assert_eq!(pos_to_chunk_key.as_ref().len(), 1 + 8);
        let parsed_pos = MetaKey::parse_pos_to_chunk_key(pos_to_chunk_key.as_ref()).unwrap();
        assert_eq!(pos, parsed_pos);

        // Tag + position bytes minus the trailing byte.
        let group_to_chunks_key_prefix = MetaKey::group_to_chunks_key_prefix(group_id);
        assert_eq!(group_to_chunks_key_prefix.as_ref().len(), 8);

        assert!(MetaKey::parse_group_bits_key(&[]).is_err());
        assert!(MetaKey::parse_pos_to_chunk_key(&[]).is_err());

        let timestamp_key = MetaKey::timestamp_key(1024, &[1, 2, 3, 4], 2);
        let (timestamp, chunk) = MetaKey::parse_timestamp_key(&timestamp_key.0, 2).unwrap();
        assert_eq!(timestamp, 1024);
        assert_eq!(chunk, [1, 2, 3, 4].as_slice());

        // A key with nothing after the timestamp is rejected.
        MetaKey::parse_timestamp_key(&[MetaKey::TIMESTAMP_KEY_PREFIX, 0, 1, 2, 3, 4, 5, 6, 7], 0)
            .unwrap_err();

        MetaKey::parse_writing_chunk_key(MetaKey::writing_chunk_key_prefix().as_ref()).unwrap_err();
    }
}
|
||||
152
src/storage/chunk_engine/src/meta/meta_merge.rs
Normal file
152
src/storage/chunk_engine/src/meta/meta_merge.rs
Normal file
@@ -0,0 +1,152 @@
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use derse::{DownwardBytes, Serialize};
|
||||
|
||||
use super::super::{GroupState, MergeState, MetaKey};
|
||||
|
||||
/// Stateless merge operator for the metadata store; it dispatches on the first
/// byte of the key (see the `MergeOp` impl).
pub struct MetaMergeOp;
|
||||
|
||||
impl super::MergeOp for MetaMergeOp {
|
||||
fn full_merge<'a>(
|
||||
key: &[u8],
|
||||
value: Option<&[u8]>,
|
||||
operands: impl Iterator<Item = &'a [u8]>,
|
||||
) -> Option<Vec<u8>> {
|
||||
match key[0] {
|
||||
MetaKey::GROUP_BITS_KEY_PREFIX => {
|
||||
let mut merge_bits = MergeState::empty();
|
||||
for op in operands {
|
||||
merge_bits.merge(&MergeState::from(op).ok()?);
|
||||
}
|
||||
|
||||
let mut bits = if let Some(group_bits) = value {
|
||||
GroupState::from(group_bits).ok()?
|
||||
} else {
|
||||
GroupState::empty()
|
||||
};
|
||||
bits.update(&merge_bits);
|
||||
Some(Vec::from(bits.as_bytes()))
|
||||
}
|
||||
MetaKey::USED_SIZE_KEY_PREFIX => {
|
||||
let mut total = 0i64;
|
||||
for op in operands {
|
||||
if op.len() != std::mem::size_of_val(&total) {
|
||||
return None;
|
||||
}
|
||||
total += LittleEndian::read_i64(op);
|
||||
}
|
||||
if let Some(value) = value {
|
||||
if value.len() != std::mem::size_of_val(&total) {
|
||||
return None;
|
||||
}
|
||||
total += LittleEndian::read_i64(value);
|
||||
}
|
||||
let mut vec = Vec::with_capacity(std::mem::size_of_val(&total));
|
||||
vec.extend_from_slice(&total.to_le_bytes());
|
||||
Some(vec)
|
||||
}
|
||||
MetaKey::TEST_KEY_PREFIX => {
|
||||
let mut out = Vec::<u8>::new();
|
||||
if let Some(value) = value {
|
||||
out.extend_from_slice(value);
|
||||
}
|
||||
for op in operands {
|
||||
out.extend_from_slice(op);
|
||||
}
|
||||
Some(out)
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn partial_merge<'a>(key: &[u8], operands: impl Iterator<Item = &'a [u8]>) -> Option<Vec<u8>> {
|
||||
match key[0] {
|
||||
MetaKey::GROUP_BITS_KEY_PREFIX => {
|
||||
let mut merge_bits = MergeState::empty();
|
||||
for op in operands {
|
||||
merge_bits.merge(&MergeState::from(op).ok()?);
|
||||
}
|
||||
|
||||
if let Ok(bytes) = merge_bits.serialize::<DownwardBytes>() {
|
||||
Some(Vec::from(bytes.as_slice()))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
MetaKey::USED_SIZE_KEY_PREFIX => {
|
||||
let mut total = 0i64;
|
||||
for op in operands {
|
||||
if op.len() != std::mem::size_of_val(&total) {
|
||||
return None;
|
||||
}
|
||||
total += LittleEndian::read_i64(op);
|
||||
}
|
||||
let mut vec = Vec::with_capacity(std::mem::size_of_val(&total));
|
||||
vec.extend_from_slice(&total.to_le_bytes());
|
||||
Some(vec)
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::*;

    /// A key from an unknown family cannot be partially merged.
    #[test]
    fn test_meta_merge_op() {
        let operand = [233u8].as_slice();
        let merged = MetaMergeOp::partial_merge(&[233], vec![operand].into_iter());
        assert_eq!(merged, None);
    }

    /// Used-size operands are summed by both merge flavours; malformed
    /// operands make the merge fail.
    #[test]
    fn test_used_size_merge() {
        let expected_sum = (0..10).sum::<i64>();

        // partial merge: operands only.
        let deltas: Vec<Vec<u8>> = (0..10)
            .map(|i| (i as i64).to_le_bytes().to_vec())
            .collect();
        let merged = MetaMergeOp::partial_merge(
            &[MetaKey::USED_SIZE_KEY_PREFIX],
            deltas.iter().map(|v| v.as_slice()),
        )
        .unwrap();
        assert_eq!(LittleEndian::read_i64(&merged), expected_sum);

        // test full merge.
        let deltas: Vec<Vec<u8>> = (0..10)
            .map(|i| (i as i64).to_le_bytes().to_vec())
            .collect();
        let value = 10i64;
        let merged = MetaMergeOp::full_merge(
            &[MetaKey::USED_SIZE_KEY_PREFIX],
            Some(value.to_le_bytes().as_slice()),
            deltas.iter().map(|v| v.as_slice()),
        )
        .unwrap();
        assert_eq!(LittleEndian::read_i64(&merged), expected_sum + value);

        // test invalid ops.
        let invalid_ops = [vec![1, 2, 3]];
        let merged = MetaMergeOp::partial_merge(
            &[MetaKey::USED_SIZE_KEY_PREFIX],
            invalid_ops.iter().map(|v| v.as_slice()),
        );
        assert_eq!(merged, None);
    }
}
|
||||
873
src/storage/chunk_engine/src/meta/meta_store.rs
Normal file
873
src/storage/chunk_engine/src/meta/meta_store.rs
Normal file
@@ -0,0 +1,873 @@
|
||||
use std::{cell::RefCell, collections::HashMap, ops::DerefMut};
|
||||
|
||||
use super::super::*;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use derse::{Deserialize, DownwardBytes, Serialize};
|
||||
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct MetaStoreConfig {
|
||||
pub rocksdb: RocksDBConfig,
|
||||
pub prefix_len: usize,
|
||||
}
|
||||
|
||||
pub struct MetaStore {
|
||||
rocksdb: RocksDB,
|
||||
config: MetaStoreConfig,
|
||||
}
|
||||
|
||||
impl MetaStore {
|
||||
thread_local! {
|
||||
static BYTES: RefCell<DownwardBytes> = RefCell::new(DownwardBytes::with_capacity(Size::MB.into()));
|
||||
}
|
||||
|
||||
pub fn open(config: &MetaStoreConfig) -> Result<Self> {
|
||||
let rocksdb = RocksDB::open::<MetaMergeOp>(&config.rocksdb)?;
|
||||
|
||||
let mut this = MetaStore {
|
||||
rocksdb,
|
||||
config: config.clone(),
|
||||
};
|
||||
|
||||
this.update_used_size_if_need()?;
|
||||
|
||||
Ok(this)
|
||||
}
|
||||
|
||||
pub fn get_chunk_meta(&self, chunk_id: &[u8]) -> Result<Option<ChunkMeta>> {
|
||||
let chunk_meta_key = MetaKey::chunk_meta_key(chunk_id);
|
||||
let value = self.rocksdb.get(chunk_meta_key)?;
|
||||
|
||||
if let Some(value) = value {
|
||||
Ok(Some(
|
||||
ChunkMeta::deserialize(value.as_ref()).map_err(Error::SerializationError)?,
|
||||
))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn query_chunks(
|
||||
&self,
|
||||
begin: impl AsRef<[u8]>,
|
||||
end: impl AsRef<[u8]>,
|
||||
max_count: u64,
|
||||
) -> Result<Vec<(Bytes, ChunkMeta)>> {
|
||||
let it = self.iterator();
|
||||
self.query_chunks_from_iterator(it, begin, end, max_count)
|
||||
}
|
||||
|
||||
pub fn query_chunks_from_iterator(
|
||||
&self,
|
||||
mut it: RocksDBIterator,
|
||||
begin: impl AsRef<[u8]>,
|
||||
end: impl AsRef<[u8]>,
|
||||
max_count: u64,
|
||||
) -> Result<Vec<(Bytes, ChunkMeta)>> {
|
||||
let mut out = Vec::<(Bytes, ChunkMeta)>::with_capacity(4096);
|
||||
|
||||
let end_key = MetaKey::chunk_meta_key(end.as_ref());
|
||||
it.seek(&end_key)?;
|
||||
|
||||
if it.key() == Some(end_key.as_ref()) {
|
||||
it.next(); // [begin, end)
|
||||
}
|
||||
|
||||
for _ in 0..max_count {
|
||||
if !it.valid() {
|
||||
break;
|
||||
}
|
||||
|
||||
if it.key().unwrap()[0] != MetaKey::CHUNK_META_KEY_PREFIX {
|
||||
break;
|
||||
}
|
||||
|
||||
let chunk_id = MetaKey::parse_chunk_meta_key(it.key().unwrap());
|
||||
if begin.as_ref() <= chunk_id.as_ref() {
|
||||
let chunk_meta = ChunkMeta::deserialize(it.value().unwrap())
|
||||
.map_err(Error::SerializationError)?;
|
||||
out.push((chunk_id, chunk_meta))
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
|
||||
it.next();
|
||||
}
|
||||
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
pub fn query_chunks_by_timestamp(
|
||||
&self,
|
||||
prefix: &[u8],
|
||||
begin: u64,
|
||||
end: u64,
|
||||
max_count: u64,
|
||||
) -> Result<Vec<Bytes>> {
|
||||
let mut it = self.iterator();
|
||||
let mut out = Vec::<Bytes>::with_capacity(4096);
|
||||
|
||||
let begin_key = MetaKey::timestamp_key_filter(prefix, begin);
|
||||
it.seek(&begin_key)?;
|
||||
|
||||
for _ in 0..max_count {
|
||||
if !it.valid() {
|
||||
break;
|
||||
}
|
||||
|
||||
let key = it.key().unwrap();
|
||||
if key[0] != MetaKey::TIMESTAMP_KEY_PREFIX {
|
||||
break;
|
||||
}
|
||||
if key.len() <= prefix.len() || &key[1..1 + self.config.prefix_len] != prefix {
|
||||
break;
|
||||
}
|
||||
|
||||
let (timestamp, chunk_id) = MetaKey::parse_timestamp_key(key, self.config.prefix_len)?;
|
||||
if timestamp < end {
|
||||
out.push(chunk_id)
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
|
||||
it.next();
|
||||
}
|
||||
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn write(&self, write_batch: rocksdb::WriteBatch, sync: bool) -> Result<()> {
|
||||
self.rocksdb.write(write_batch, sync)
|
||||
}
|
||||
|
||||
pub fn add_chunk(&self, chunk_id: &[u8], chunk_meta: &ChunkMeta, sync: bool) -> Result<()> {
|
||||
let mut write_batch = RocksDB::new_write_batch();
|
||||
self.add_chunk_mut(chunk_id, chunk_meta, &mut write_batch)?;
|
||||
self.write(write_batch, sync)
|
||||
}
|
||||
|
||||
pub fn add_chunk_mut(
|
||||
&self,
|
||||
chunk_id: &[u8],
|
||||
chunk_meta: &ChunkMeta,
|
||||
write_batch: &mut rocksdb::WriteBatch,
|
||||
) -> Result<()> {
|
||||
// 1. add chunk meta.
|
||||
let chunk_meta_key = MetaKey::chunk_meta_key(chunk_id);
|
||||
Self::with_tls_bytes(|bytes| {
|
||||
chunk_meta
|
||||
.serialize_to(bytes)
|
||||
.map_err(Error::SerializationError)?;
|
||||
write_batch.put(chunk_meta_key, &bytes[..]);
|
||||
Ok(())
|
||||
})?;
|
||||
|
||||
// 2. add pos->chunk map.
|
||||
let pos_to_chunk_key = MetaKey::pos_to_chunk_key(chunk_meta.pos);
|
||||
write_batch.put(pos_to_chunk_key, chunk_id);
|
||||
|
||||
// 3. update group bits.
|
||||
let group_bits_key = MetaKey::group_bits_key(chunk_meta.pos.group_id());
|
||||
Self::with_tls_bytes(|bytes| {
|
||||
MergeState::acquire(chunk_meta.pos.index())
|
||||
.serialize_to(bytes)
|
||||
.map_err(Error::SerializationError)?;
|
||||
write_batch.merge(group_bits_key, &bytes[..]);
|
||||
Ok(())
|
||||
})?;
|
||||
|
||||
// 4. update used size.
|
||||
self.update_used_size(chunk_id, chunk_meta.pos.chunk_size().0 as i64, write_batch)?;
|
||||
|
||||
// 5. add timestamp->chunk map.
|
||||
let timestamp_key =
|
||||
MetaKey::timestamp_key(chunk_meta.timestamp, chunk_id, self.config.prefix_len);
|
||||
write_batch.put(timestamp_key, chunk_id);
|
||||
|
||||
// 6. remove writing chunk log.
|
||||
self.remove_writing_chunk_mut(chunk_id, write_batch);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn move_chunk(
|
||||
&self,
|
||||
chunk_id: &[u8],
|
||||
old_meta: &ChunkMeta,
|
||||
new_meta: &ChunkMeta,
|
||||
sync: bool,
|
||||
) -> Result<()> {
|
||||
let mut write_batch = RocksDB::new_write_batch();
|
||||
self.move_chunk_mut(chunk_id, old_meta, new_meta, &mut write_batch)?;
|
||||
self.write(write_batch, sync)
|
||||
}
|
||||
|
||||
pub fn move_chunk_mut(
|
||||
&self,
|
||||
chunk_id: &[u8],
|
||||
old_meta: &ChunkMeta,
|
||||
new_meta: &ChunkMeta,
|
||||
write_batch: &mut rocksdb::WriteBatch,
|
||||
) -> Result<()> {
|
||||
// 1. change chunk meta.
|
||||
let chunk_meta_key = MetaKey::chunk_meta_key(chunk_id);
|
||||
Self::with_tls_bytes(|bytes| {
|
||||
new_meta
|
||||
.serialize_to(bytes)
|
||||
.map_err(Error::SerializationError)?;
|
||||
write_batch.put(chunk_meta_key, bytes.as_slice());
|
||||
Ok(())
|
||||
})?;
|
||||
|
||||
if old_meta.pos != new_meta.pos {
|
||||
// 2. remove old pos->chunk map.
|
||||
let old_pos = old_meta.pos;
|
||||
let pos_to_chunk_key = MetaKey::pos_to_chunk_key(old_pos);
|
||||
write_batch.delete(pos_to_chunk_key);
|
||||
|
||||
let group_bits_key = MetaKey::group_bits_key(old_pos.group_id());
|
||||
Self::with_tls_bytes(|bytes| {
|
||||
MergeState::release(old_pos.index())
|
||||
.serialize_to(bytes)
|
||||
.map_err(Error::SerializationError)?;
|
||||
write_batch.merge(group_bits_key, &bytes[..]);
|
||||
Ok(())
|
||||
})?;
|
||||
|
||||
// 3. add new pos->chunk map.
|
||||
let pos_to_chunk_key = MetaKey::pos_to_chunk_key(new_meta.pos);
|
||||
write_batch.put(pos_to_chunk_key, chunk_id);
|
||||
|
||||
let group_bits_key = MetaKey::group_bits_key(new_meta.pos.group_id());
|
||||
Self::with_tls_bytes(|bytes| {
|
||||
MergeState::acquire(new_meta.pos.index())
|
||||
.serialize_to(bytes)
|
||||
.map_err(Error::SerializationError)?;
|
||||
write_batch.merge(group_bits_key, &bytes[..]);
|
||||
Ok(())
|
||||
})?;
|
||||
|
||||
// 4. update used size.
|
||||
self.update_used_size(
|
||||
chunk_id,
|
||||
new_meta.pos.chunk_size().0 as i64 - old_pos.chunk_size().0 as i64,
|
||||
write_batch,
|
||||
)?;
|
||||
}
|
||||
|
||||
// 5. update timestamp->chunk map.
|
||||
self.check_chunk_id(chunk_id)?;
|
||||
let timestamp_key =
|
||||
MetaKey::timestamp_key(new_meta.timestamp, chunk_id, self.config.prefix_len);
|
||||
write_batch.put(timestamp_key, []);
|
||||
let timestamp_key =
|
||||
MetaKey::timestamp_key(old_meta.timestamp, chunk_id, self.config.prefix_len);
|
||||
write_batch.delete(timestamp_key);
|
||||
|
||||
// 6. remove writing chunk log.
|
||||
self.remove_writing_chunk_mut(chunk_id, write_batch);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn remove(&self, chunk_id: &[u8], chunk_meta: &ChunkMeta, sync: bool) -> Result<()> {
|
||||
let mut write_batch = RocksDB::new_write_batch();
|
||||
self.remove_mut(chunk_id, chunk_meta, &mut write_batch)?;
|
||||
self.write(write_batch, sync)
|
||||
}
|
||||
|
||||
pub fn remove_mut(
|
||||
&self,
|
||||
chunk_id: &[u8],
|
||||
chunk_meta: &ChunkMeta,
|
||||
write_batch: &mut rocksdb::WriteBatch,
|
||||
) -> Result<()> {
|
||||
// 1. delete chunk meta.
|
||||
let chunk_meta_key = MetaKey::chunk_meta_key(chunk_id);
|
||||
write_batch.delete(chunk_meta_key);
|
||||
|
||||
// 2. delete pos->chunk map.
|
||||
let pos_to_chunk_key = MetaKey::pos_to_chunk_key(chunk_meta.pos);
|
||||
write_batch.delete(pos_to_chunk_key);
|
||||
|
||||
// 3. release position.
|
||||
let group_bits_key = MetaKey::group_bits_key(chunk_meta.pos.group_id());
|
||||
Self::with_tls_bytes(|bytes| {
|
||||
MergeState::release(chunk_meta.pos.index())
|
||||
.serialize_to(bytes)
|
||||
.map_err(Error::SerializationError)?;
|
||||
write_batch.merge(group_bits_key, &bytes[..]);
|
||||
Ok(())
|
||||
})?;
|
||||
|
||||
// 4. update used size.
|
||||
self.update_used_size(
|
||||
chunk_id,
|
||||
-(chunk_meta.pos.chunk_size().0 as i64),
|
||||
write_batch,
|
||||
)?;
|
||||
|
||||
// 5. delete timestamp->chunk map.
|
||||
let timestamp_key =
|
||||
MetaKey::timestamp_key(chunk_meta.timestamp, chunk_id, self.config.prefix_len);
|
||||
write_batch.delete(timestamp_key);
|
||||
|
||||
// 6. remove writing chunk log.
|
||||
self.remove_writing_chunk_mut(chunk_id, write_batch);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn allocate_group(&self, group_id: GroupId) -> Result<()> {
|
||||
let group_bits_key = MetaKey::group_bits_key(group_id);
|
||||
self.rocksdb
|
||||
.put(group_bits_key, GroupState::empty().as_bytes(), true)
|
||||
}
|
||||
|
||||
pub fn remove_group(&self, group_id: GroupId) -> Result<()> {
|
||||
let group_bits_key = MetaKey::group_bits_key(group_id);
|
||||
self.rocksdb.delete(group_bits_key, true)
|
||||
}
|
||||
|
||||
/// Creates a new iterator over the underlying database.
pub fn iterator(&self) -> RocksDBIterator {
    let db = &self.rocksdb;
    db.new_iterator()
}
|
||||
|
||||
fn update_used_size(
|
||||
&self,
|
||||
chunk_id: &[u8],
|
||||
diff: i64,
|
||||
write_batch: &mut rocksdb::WriteBatch,
|
||||
) -> Result<()> {
|
||||
self.check_chunk_id(chunk_id)?;
|
||||
let used_size_key = MetaKey::used_size_key(&chunk_id[..self.config.prefix_len]);
|
||||
write_batch.merge(used_size_key, diff.to_le_bytes());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn persist_writing_chunk(&self, chunk_id: &[u8], chunk_meta: &ChunkMeta) -> Result<()> {
|
||||
let chunk_meta_key = MetaKey::writing_chunk_key(chunk_id);
|
||||
Self::with_tls_bytes(|bytes| {
|
||||
chunk_meta
|
||||
.serialize_to(bytes)
|
||||
.map_err(Error::SerializationError)?;
|
||||
self.rocksdb.put(chunk_meta_key, &bytes[..], true)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn remove_writing_chunk_mut(&self, chunk_id: &[u8], write_batch: &mut rocksdb::WriteBatch) {
|
||||
write_batch.delete(MetaKey::writing_chunk_key(chunk_id));
|
||||
}
|
||||
|
||||
pub fn occupy_uncommitted_positions(&mut self) -> Result<Vec<(Bytes, ChunkMeta, bool)>> {
|
||||
let mut prefix_len = 0;
|
||||
std::mem::swap(&mut self.config.prefix_len, &mut prefix_len);
|
||||
let list = self.query_uncommitted_chunks(&[])?;
|
||||
std::mem::swap(&mut self.config.prefix_len, &mut prefix_len);
|
||||
|
||||
let mut uncommitted_chunks = vec![];
|
||||
let mut write_batch = RocksDB::new_write_batch();
|
||||
let mut count = 0;
|
||||
for (chunk_id, writing_meta) in list {
|
||||
let pos = writing_meta.pos;
|
||||
match self.get_chunk_meta(&chunk_id)? {
|
||||
Some(meta) if meta.pos == writing_meta.pos => {
|
||||
uncommitted_chunks.push((chunk_id, writing_meta, false));
|
||||
}
|
||||
_ => {
|
||||
uncommitted_chunks.push((chunk_id.clone(), writing_meta, true));
|
||||
|
||||
count += 1;
|
||||
let pos_to_chunk_key = MetaKey::pos_to_chunk_key(pos);
|
||||
write_batch.put(pos_to_chunk_key, chunk_id);
|
||||
|
||||
let group_bits_key = MetaKey::group_bits_key(pos.group_id());
|
||||
Self::with_tls_bytes(|bytes| {
|
||||
MergeState::acquire(pos.index())
|
||||
.serialize_to(bytes)
|
||||
.map_err(Error::SerializationError)?;
|
||||
write_batch.merge(group_bits_key, &bytes[..]);
|
||||
Ok(())
|
||||
})?;
|
||||
}
|
||||
}
|
||||
}
|
||||
if !uncommitted_chunks.is_empty() {
|
||||
self.write(write_batch, true)?;
|
||||
tracing::info!("occupy {} positions for writing chunks", count);
|
||||
}
|
||||
Ok(uncommitted_chunks)
|
||||
}
|
||||
|
||||
pub fn vacate_uncommitted_positions(
|
||||
&self,
|
||||
uncommitted_chunks: Vec<(Bytes, ChunkMeta, bool)>,
|
||||
) -> Result<()> {
|
||||
let mut write_batch = RocksDB::new_write_batch();
|
||||
let mut count = 0;
|
||||
for (_, chunk_meta, occupied) in uncommitted_chunks {
|
||||
if !occupied {
|
||||
continue;
|
||||
}
|
||||
|
||||
count += 1;
|
||||
let pos_to_chunk_key = MetaKey::pos_to_chunk_key(chunk_meta.pos);
|
||||
write_batch.delete(pos_to_chunk_key);
|
||||
|
||||
let group_bits_key = MetaKey::group_bits_key(chunk_meta.pos.group_id());
|
||||
Self::with_tls_bytes(|bytes| {
|
||||
MergeState::release(chunk_meta.pos.index())
|
||||
.serialize_to(bytes)
|
||||
.map_err(Error::SerializationError)?;
|
||||
write_batch.merge(group_bits_key, &bytes[..]);
|
||||
Ok(())
|
||||
})?;
|
||||
}
|
||||
self.write(write_batch, true)?;
|
||||
tracing::info!("vacate {} positions for writing chunks", count);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn query_uncommitted_chunks(&self, prefix: &[u8]) -> Result<Vec<(Bytes, ChunkMeta)>> {
|
||||
self.check_prefix(prefix)?;
|
||||
|
||||
let mut it = self.iterator();
|
||||
let mut out = Vec::<(Bytes, ChunkMeta)>::with_capacity(4096);
|
||||
|
||||
let end_key = MetaKey::writing_chunk_key(prefix);
|
||||
it.seek(&end_key)?;
|
||||
|
||||
if it.key() == Some(end_key.as_ref()) {
|
||||
it.next(); // [begin, end)
|
||||
}
|
||||
|
||||
loop {
|
||||
if !it.valid() {
|
||||
break;
|
||||
}
|
||||
|
||||
if it.key().unwrap()[0] != MetaKey::WRITING_CHUNK_KEY_PREFIX {
|
||||
break;
|
||||
}
|
||||
|
||||
let chunk_id = MetaKey::parse_writing_chunk_key(it.key().unwrap())?;
|
||||
if prefix <= chunk_id.as_ref() {
|
||||
let chunk_meta = ChunkMeta::deserialize(it.value().unwrap())
|
||||
.map_err(Error::SerializationError)?;
|
||||
out.push((chunk_id, chunk_meta))
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
|
||||
it.next();
|
||||
}
|
||||
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
fn check_chunk_id(&self, chunk_id: &[u8]) -> Result<()> {
|
||||
let prefix_len = self.config.prefix_len;
|
||||
if chunk_id.len() < prefix_len {
|
||||
return Err(Error::InvalidArg(format!(
|
||||
"chunk_id.len() < prefix len: {:?}, {}",
|
||||
chunk_id, prefix_len
|
||||
)));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn check_prefix(&self, prefix: &[u8]) -> Result<()> {
|
||||
let prefix_len = self.config.prefix_len;
|
||||
if prefix.len() != prefix_len {
|
||||
return Err(Error::InvalidArg(format!(
|
||||
"prefix.len() != prefix len: {:?}, {}",
|
||||
prefix, prefix_len
|
||||
)));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn query_used_size(&self, prefix: &[u8]) -> Result<u64> {
|
||||
self.check_prefix(prefix)?;
|
||||
|
||||
let used_size_key = MetaKey::used_size_key(prefix);
|
||||
let value = self.rocksdb.get(used_size_key)?;
|
||||
if let Some(size) = value {
|
||||
if size.len() != std::mem::size_of::<u64>() {
|
||||
Err(Error::InvalidArg(format!(
|
||||
"invalid size length: {:?}",
|
||||
size.as_ref()
|
||||
)))
|
||||
} else {
|
||||
Ok(LittleEndian::read_u64(size.as_ref()))
|
||||
}
|
||||
} else {
|
||||
Ok(0)
|
||||
}
|
||||
}
|
||||
|
||||
fn with_tls_bytes<F, R>(f: F) -> R
|
||||
where
|
||||
F: FnOnce(&mut DownwardBytes) -> R,
|
||||
{
|
||||
Self::BYTES.with(|v| {
|
||||
let mut bytes = v.borrow_mut();
|
||||
let result = f(bytes.deref_mut());
|
||||
bytes.clear_and_shrink_to(Size::MB.into());
|
||||
result
|
||||
})
|
||||
}
|
||||
|
||||
fn update_used_size_if_need(&mut self) -> Result<()> {
|
||||
let old_len = match self.rocksdb.get(MetaKey::used_size_prefix_len_key())? {
|
||||
Some(value) => {
|
||||
if value.len() != std::mem::size_of::<u32>() {
|
||||
return Err(Error::InvalidArg(format!(
|
||||
"invalid used size prefix length: {:?}",
|
||||
value.as_ref()
|
||||
)));
|
||||
}
|
||||
LittleEndian::read_u32(value.as_ref()) as usize
|
||||
}
|
||||
None => 0,
|
||||
};
|
||||
|
||||
let prefix_len = self.config.prefix_len;
|
||||
if old_len == prefix_len {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut map = HashMap::<Bytes, u64>::new();
|
||||
if prefix_len == 0 {
|
||||
map.insert(Bytes::new(), 0);
|
||||
}
|
||||
let mut it = self.iterator();
|
||||
it.iterate(MetaKey::chunk_meta_key_prefix(), |key, value| {
|
||||
let mut chunk_id = MetaKey::parse_chunk_meta_key(key);
|
||||
chunk_id.resize(prefix_len, 0);
|
||||
let chunk_meta = ChunkMeta::deserialize(value).map_err(Error::SerializationError)?;
|
||||
let chunk_size = chunk_meta.pos.chunk_size().0;
|
||||
map.entry(chunk_id)
|
||||
.and_modify(|v| *v += chunk_size)
|
||||
.or_insert(chunk_size);
|
||||
Ok(())
|
||||
})?;
|
||||
|
||||
let mut write_batch = RocksDB::new_write_batch();
|
||||
write_batch.put(
|
||||
MetaKey::used_size_prefix_len_key(),
|
||||
(prefix_len as u32).to_le_bytes(),
|
||||
);
|
||||
for (prefix, size) in map {
|
||||
write_batch.put(MetaKey::used_size_key(&prefix), size.to_le_bytes())
|
||||
}
|
||||
self.write(write_batch, true)
|
||||
}
|
||||
|
||||
/// Version number 1 of the stored metadata.
/// NOTE(review): the name suggests this version marks a timestamp fix — confirm against the upgrade code.
pub const V1_FIX_TIMESTAMP: u8 = 1;
|
||||
/// Newest known metadata version; currently the same value as `V1_FIX_TIMESTAMP`.
pub const LATEST_VERSION: u8 = Self::V1_FIX_TIMESTAMP;
|
||||
|
||||
pub fn get_version(&self) -> Result<u8> {
|
||||
match self.rocksdb.get(MetaKey::version_key())? {
|
||||
Some(value) if !value.is_empty() => Ok(value[0]),
|
||||
Some(value) => Err(Error::InvalidArg(format!(
|
||||
"invalid version: {:?}",
|
||||
value.as_ref()
|
||||
))),
|
||||
None => Ok(0),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_version(&self, version: u8) -> Result<()> {
|
||||
self.rocksdb.put(MetaKey::version_key(), [version], true)
|
||||
}
|
||||
|
||||
pub fn remove_range_mut(
|
||||
&self,
|
||||
prefix: u8,
|
||||
write_batch: &mut rocksdb::WriteBatch,
|
||||
) -> Result<()> {
|
||||
if prefix == MetaKey::CHUNK_META_KEY_PREFIX
|
||||
|| prefix == MetaKey::GROUP_BITS_KEY_PREFIX
|
||||
|| prefix == MetaKey::POS_TO_CHUNK_KEY_PREFIX
|
||||
|| prefix == MetaKey::USED_SIZE_KEY_PREFIX
|
||||
|| prefix == MetaKey::USED_SIZE_PREFIX_LEN_KEY
|
||||
{
|
||||
return Err(Error::InvalidArg(format!(
|
||||
"invalid remove range: {}",
|
||||
prefix
|
||||
)));
|
||||
}
|
||||
|
||||
write_batch.delete_range(&[prefix], &[prefix + 1]);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
/// Add / get / query / remove round trip on a fresh store, plus the rejection
/// paths of `remove_range_mut` and `get_version`.
#[test]
fn test_meta_store_normal() {
    let dir = tempfile::tempdir().unwrap();

    let rocksdb = RocksDBConfig {
        path: dir.path().into(),
        create: true,
        ..Default::default()
    };
    let config = MetaStoreConfig {
        rocksdb,
        ..Default::default()
    };
    let meta_store = MetaStore::open(&config).unwrap();

    // Unknown chunk.
    let chunk_id = "1000".as_bytes();
    assert!(meta_store.get_chunk_meta(chunk_id).unwrap().is_none());

    let chunk_meta_in = ChunkMeta {
        chunk_ver: 1,
        ..Default::default()
    };
    meta_store
        .add_chunk(chunk_id, &chunk_meta_in, false)
        .unwrap();

    // The stored meta reads back unchanged and is visible to range queries.
    let chunk_id = "1000".as_bytes();
    let chunk_meta_out = meta_store.get_chunk_meta(chunk_id).unwrap().unwrap();
    assert_eq!(chunk_meta_in, chunk_meta_out);
    assert_eq!(meta_store.query_chunks([], "100", 10).unwrap().len(), 1);

    meta_store.remove(chunk_id, &chunk_meta_out, false).unwrap();

    // Protected key families cannot be range-deleted.
    let mut write_batch = RocksDB::new_write_batch();
    meta_store
        .remove_range_mut(MetaKey::CHUNK_META_KEY_PREFIX, &mut write_batch)
        .unwrap_err();

    // An empty version value is invalid.
    meta_store
        .rocksdb
        .put(MetaKey::version_key(), &[], false)
        .unwrap();
    meta_store.get_version().unwrap_err();
}
|
||||
|
||||
/// Fills one group position by position and checks range queries and the
/// merged group state after each round.
#[test]
fn test_meta_get_set() {
    let dir = tempfile::tempdir().unwrap();

    let rocksdb = RocksDBConfig {
        path: dir.path().into(),
        create: true,
        ..Default::default()
    };
    let config = MetaStoreConfig {
        rocksdb,
        ..Default::default()
    };
    let meta_store = MetaStore::open(&config).unwrap();

    let group_id = GroupId::default();
    let mut chunk_meta = ChunkMeta::default();

    // Round 1: chunks 0..128 take the even positions.
    for i in 0..128u32 {
        chunk_meta.pos = Position::new(group_id, 2 * (i as u8));
        meta_store
            .add_chunk(&i.to_be_bytes(), &chunk_meta, false)
            .unwrap();
    }

    let vec = meta_store
        .query_chunks(10u32.to_be_bytes(), 20u32.to_be_bytes(), 30)
        .unwrap();
    assert_eq!(vec.len(), 10);
    assert_eq!(vec.first().unwrap().0.as_ref(), &19u32.to_be_bytes());
    assert_eq!(vec.last().unwrap().0.as_ref(), &10u32.to_be_bytes());

    let vec = meta_store
        .query_chunks(80u32.to_be_bytes(), 100u32.to_be_bytes(), 30)
        .unwrap();
    assert_eq!(vec.len(), 20);

    // Exactly one group entry, with every even bit set and every odd bit clear.
    let mut groups_seen = 0;
    let mut it = meta_store.iterator();
    it.iterate(MetaKey::group_bits_key_prefix(), |_key, value| {
        groups_seen += 1;
        let bits = GroupState::from(value)?;
        assert_eq!(bits.count(), 128);
        for i in 0..128 {
            assert!(bits.check(i * 2));
            assert!(!bits.check(i * 2 + 1));
        }
        Ok(())
    })
    .unwrap();
    assert_eq!(groups_seen, 1);

    // Round 2: the same chunk ids are added again at the odd positions.
    for i in 0..128u32 {
        chunk_meta.pos = Position::new(group_id, 1 + 2 * (i as u8));
        meta_store
            .add_chunk(&i.to_be_bytes(), &chunk_meta, false)
            .unwrap();
    }

    // The group is now completely occupied.
    let mut groups_seen = 0;
    let mut it = meta_store.iterator();
    it.iterate(MetaKey::group_bits_key_prefix(), |_key, value| {
        groups_seen += 1;
        let bits = GroupState::from(value)?;
        assert_eq!(bits.count(), 256);
        assert!(bits.is_full());
        Ok(())
    })
    .unwrap();
    assert_eq!(groups_seen, 1);
}
|
||||
|
||||
/// Opening a store at `/proc/test` must report an error.
#[test]
fn test_meta_store_open_failed() {
    let db_config = RocksDBConfig {
        path: "/proc/test".into(),
        create: true,
        ..Default::default()
    };
    let config = MetaStoreConfig {
        rocksdb: db_config,
        ..Default::default()
    };

    let opened = MetaStore::open(&config);
    assert!(opened.is_err());
}
|
||||
|
||||
/// Exercises used-size accounting with a 4-byte prefix, including the
/// error paths for short chunk ids and corrupted bookkeeping records.
#[test]
fn test_meta_store_update_used_size() {
    let tmp = tempfile::tempdir().unwrap();
    let db_config = RocksDBConfig {
        path: tmp.path().into(),
        create: true,
        ..Default::default()
    };
    let config = MetaStoreConfig {
        rocksdb: db_config,
        prefix_len: 4,
    };
    let meta_store = MetaStore::open(&config).unwrap();

    let chunk_id = [0, 1, 2, 3];
    let group_id = GroupId::default();
    let chunk_meta = ChunkMeta {
        pos: Position::new(group_id, 0_u8),
        ..Default::default()
    };

    // A 3-byte id is rejected by this store; the full 4-byte id is accepted.
    meta_store
        .add_chunk(&chunk_id[..3], &chunk_meta, false)
        .unwrap_err();
    meta_store.add_chunk(&chunk_id, &chunk_meta, false).unwrap();

    // Same rule for queries; an unrelated prefix reports zero usage.
    meta_store.query_used_size(&chunk_id[..3]).unwrap_err();
    let used = meta_store.query_used_size(&chunk_id).unwrap();
    assert_eq!(used, CHUNK_SIZE_NORMAL);
    assert_eq!(meta_store.query_used_size(&0u32.to_le_bytes()).unwrap(), 0);

    meta_store
        .query_chunks_by_timestamp(&0u32.to_le_bytes(), 0, u64::MAX, u64::MAX)
        .unwrap();

    // Removing the chunk returns the prefix to zero usage.
    meta_store.remove(&chunk_id, &chunk_meta, false).unwrap();
    assert_eq!(meta_store.query_used_size(&chunk_id).unwrap(), 0);

    // Overwrite the used-size record with an empty value: the query must fail.
    let used_size_key = MetaKey::used_size_key(&chunk_id);
    meta_store.rocksdb.put(used_size_key, [], false).unwrap();
    meta_store.query_used_size(&chunk_id).unwrap_err();

    // Overwrite the stored prefix length record: reopening must fail.
    meta_store
        .rocksdb
        .put(MetaKey::used_size_prefix_len_key(), [233], false)
        .unwrap();
    drop(meta_store);
    assert!(MetaStore::open(&config).is_err());
}
|
||||
|
||||
/// Reopens the same store with different `prefix_len` values and checks
/// that the used-size figures are reported per the new prefix length.
#[test]
fn test_meta_store_update_used_size_prefix_len() {
    const N: u64 = 1024;

    let tmp = tempfile::tempdir().unwrap();
    let mut config = MetaStoreConfig {
        rocksdb: RocksDBConfig {
            path: tmp.path().into(),
            create: true,
            ..Default::default()
        },
        prefix_len: 0,
    };

    let start = ChunkMeta::now();
    let meta_store = MetaStore::open(&config).unwrap();

    // Even low bytes get a normal-sized chunk, odd ones a small chunk.
    for i in 0..N {
        let id = i as u8;
        let chunk_size = match id % 2 {
            0 => CHUNK_SIZE_NORMAL,
            _ => CHUNK_SIZE_SMALL,
        };
        let meta = ChunkMeta {
            pos: Position::new(GroupId::new(chunk_size, 0, 0), id),
            ..Default::default()
        };
        let chunk_id = i.to_le_bytes();
        meta_store.add_chunk(&chunk_id, &meta, false).unwrap();
    }

    // With an empty prefix everything is accounted under one total.
    let total = meta_store.query_used_size(&[]).unwrap();
    assert_eq!(total, N / 2 * (CHUNK_SIZE_NORMAL.0 + CHUNK_SIZE_SMALL.0));

    // Write an extra, unrelated "m" key through the store.
    let mut write_batch = RocksDB::new_write_batch();
    write_batch.put("m", "m");
    meta_store.write(write_batch, false).unwrap();

    // No chunk falls in [0, start); all N fall in [start, end + 1).
    let end = ChunkMeta::now();
    let before_start = meta_store
        .query_chunks_by_timestamp(&[], 0, start, u64::MAX)
        .unwrap();
    assert!(before_start.is_empty());
    let during_test = meta_store
        .query_chunks_by_timestamp(&[], start, end + 1, u64::MAX)
        .unwrap();
    assert_eq!(during_test.len(), N as usize);

    drop(meta_store);

    // Reopen with a one-byte prefix: usage is now reported per first byte.
    config.prefix_len = 1;
    let meta_store = MetaStore::open(&config).unwrap();
    for first_byte in 0..=u8::MAX {
        let size = meta_store.query_used_size(&[first_byte]).unwrap();
        let expected = if first_byte % 2 == 0 {
            N / 256 * CHUNK_SIZE_NORMAL.0
        } else {
            N / 256 * CHUNK_SIZE_SMALL.0
        };
        assert_eq!(size, expected);
    }

    // Remove every chunk; each prefix must drop back to zero.
    for i in 0..N {
        let chunk_id = i.to_le_bytes();
        let meta = meta_store.get_chunk_meta(&chunk_id).unwrap().unwrap();
        meta_store.remove(&chunk_id, &meta, false).unwrap();
    }
    for first_byte in 0..=u8::MAX {
        let size = meta_store.query_used_size(&[first_byte]).unwrap();
        assert_eq!(size, 0);
    }

    drop(meta_store);

    // Back to the empty prefix: the total is zero as well.
    config.prefix_len = 0;
    let meta_store = MetaStore::open(&config).unwrap();
    let total = meta_store.query_used_size(&[]).unwrap();
    assert_eq!(total, 0);
}
|
||||
}
|
||||
9
src/storage/chunk_engine/src/meta/mod.rs
Normal file
9
src/storage/chunk_engine/src/meta/mod.rs
Normal file
@@ -0,0 +1,9 @@
|
||||
mod meta_key;
|
||||
mod meta_merge;
|
||||
mod meta_store;
|
||||
mod rocksdb;
|
||||
|
||||
pub use meta_key::*;
|
||||
pub use meta_merge::*;
|
||||
pub use meta_store::*;
|
||||
pub use rocksdb::*;
|
||||
314
src/storage/chunk_engine/src/meta/rocksdb.rs
Normal file
314
src/storage/chunk_engine/src/meta/rocksdb.rs
Normal file
@@ -0,0 +1,314 @@
|
||||
use crate::{Error, Result, Size};
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// Options consumed by [`RocksDB::open`].
#[derive(Clone, Debug, Default)]
pub struct RocksDBConfig {
    /// Filesystem location of the database.
    pub path: PathBuf,
    /// Forwarded to RocksDB's `create_if_missing` option.
    pub create: bool,
    /// When set, the database is opened through `open_for_read_only`.
    pub read_only: bool,
}
|
||||
|
||||
pub struct RocksDB {
|
||||
db: rocksdb::DB,
|
||||
write_options: [rocksdb::WriteOptions; 2], // 0 for non-sync, 1 for sync.
|
||||
}
|
||||
|
||||
/// Merge callbacks plugged into RocksDB by [`RocksDB::open`].
pub trait MergeOp {
    /// Combines the optional existing `value` of `key` with all pending
    /// `operands`. The return value is handed back to RocksDB unchanged.
    fn full_merge<'a>(
        key: &[u8],
        value: Option<&[u8]>,
        operands: impl Iterator<Item = &'a [u8]>,
    ) -> Option<Vec<u8>>;

    /// Combines several `operands` of `key` without access to the stored
    /// value. The return value is handed back to RocksDB unchanged.
    fn partial_merge<'a>(key: &[u8], operands: impl Iterator<Item = &'a [u8]>) -> Option<Vec<u8>>;
}
|
||||
|
||||
impl RocksDB {
|
||||
pub fn open<T: MergeOp + 'static>(config: &RocksDBConfig) -> Result<Self> {
|
||||
let mut db_options = rocksdb::Options::default();
|
||||
db_options.create_if_missing(config.create);
|
||||
db_options.set_merge_operator(
|
||||
"merge",
|
||||
|key, value, operands| T::full_merge(key, value, operands.iter()),
|
||||
|key, _value, operands| T::partial_merge(key, operands.iter()),
|
||||
);
|
||||
|
||||
let mut table_options = rocksdb::BlockBasedOptions::default();
|
||||
table_options.set_bloom_filter(10.0, true);
|
||||
db_options.set_block_based_table_factory(&table_options);
|
||||
|
||||
let db = if config.read_only {
|
||||
rocksdb::DB::open_for_read_only(&db_options, &config.path, false)
|
||||
} else {
|
||||
rocksdb::DB::open(&db_options, &config.path)
|
||||
}
|
||||
.map_err(|err| Error::RocksDBError(format!("open rocksdb fail: {:?}", err)))?;
|
||||
|
||||
let mut sync_write_options = rocksdb::WriteOptions::new();
|
||||
sync_write_options.set_sync(true);
|
||||
|
||||
Ok(Self {
|
||||
db,
|
||||
write_options: [rocksdb::WriteOptions::new(), sync_write_options],
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get(&self, key: impl AsRef<[u8]>) -> Result<Option<rocksdb::DBPinnableSlice>> {
|
||||
match self.db.get_pinned(key) {
|
||||
Ok(v) => Ok(v),
|
||||
Err(e) => Err(Error::RocksDBError(format!("RocksDB fail: {e:?}"))),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn put(&self, key: impl AsRef<[u8]>, value: impl AsRef<[u8]>, sync: bool) -> Result<()> {
|
||||
match self
|
||||
.db
|
||||
.put_opt(key, value, &self.write_options[sync as usize])
|
||||
{
|
||||
Ok(v) => Ok(v),
|
||||
Err(e) => Err(Error::RocksDBError(format!("RocksDB fail: {e:?}"))),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn delete(&self, key: impl AsRef<[u8]>, sync: bool) -> Result<()> {
|
||||
match self.db.delete_opt(key, &self.write_options[sync as usize]) {
|
||||
Ok(v) => Ok(v),
|
||||
Err(e) => Err(Error::RocksDBError(format!("RocksDB fail: {e:?}"))),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_write_batch() -> rocksdb::WriteBatch {
|
||||
rocksdb::WriteBatch::default()
|
||||
}
|
||||
|
||||
pub fn write(&self, batch: rocksdb::WriteBatch, sync: bool) -> Result<()> {
|
||||
match self.db.write_opt(batch, &self.write_options[sync as usize]) {
|
||||
Ok(v) => Ok(v),
|
||||
Err(e) => Err(Error::RocksDBError(format!("RocksDB fail: {e:?}"))),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_iterator(&self) -> RocksDBIterator {
|
||||
let mut read_options = rocksdb::ReadOptions::default();
|
||||
read_options.set_readahead_size(Size::mebibyte(4).into());
|
||||
RocksDBIterator(self.db.raw_iterator_opt(read_options))
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for RocksDB {
|
||||
fn drop(&mut self) {
|
||||
tracing::info!("RocksDB {:?} is closing...", self.db);
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RocksDBIterator<'a>(rocksdb::DBRawIterator<'a>);
|
||||
|
||||
impl RocksDBIterator<'_> {
|
||||
pub fn iterate<P, Fn>(&mut self, prefix: P, mut func: Fn) -> Result<u32>
|
||||
where
|
||||
P: AsRef<[u8]>,
|
||||
Fn: FnMut(&[u8], &[u8]) -> Result<()>,
|
||||
{
|
||||
let it = &mut self.0;
|
||||
it.seek(prefix.as_ref());
|
||||
let mut count = 0;
|
||||
while it.valid() && it.key().unwrap().starts_with(prefix.as_ref()) {
|
||||
func(it.key().unwrap(), it.value().unwrap_or(&[]))?;
|
||||
it.next();
|
||||
count += 1;
|
||||
}
|
||||
self.status()?;
|
||||
Ok(count)
|
||||
}
|
||||
|
||||
pub fn seek<P>(&mut self, prefix: P) -> Result<()>
|
||||
where
|
||||
P: AsRef<[u8]>,
|
||||
{
|
||||
self.0.seek(prefix.as_ref());
|
||||
self.status()
|
||||
}
|
||||
|
||||
pub fn valid(&self) -> bool {
|
||||
self.0.valid()
|
||||
}
|
||||
|
||||
pub fn status(&self) -> Result<()> {
|
||||
self.0
|
||||
.status()
|
||||
.map_err(|e| Error::RocksDBError(e.to_string()))
|
||||
}
|
||||
|
||||
pub fn next(&mut self) {
|
||||
self.0.next();
|
||||
}
|
||||
|
||||
pub fn key(&self) -> Option<&[u8]> {
|
||||
self.0.key()
|
||||
}
|
||||
|
||||
pub fn value(&self) -> Option<&[u8]> {
|
||||
self.0.value()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    /// Covers get/put, batched writes, merges and prefix iteration.
    #[test]
    fn test_rocksdb_create_get_set() {
        use super::super::*;

        let tmp = tempfile::tempdir().unwrap();
        let config = RocksDBConfig {
            path: tmp.path().into(),
            create: true,
            ..Default::default()
        };
        let rocksdb = RocksDB::open::<MetaMergeOp>(&config).unwrap();

        // Unknown key, then a plain put.
        let found = rocksdb.get("merry".as_bytes()).unwrap();
        assert!(found.is_none());

        rocksdb
            .put("merry".as_bytes(), "world".as_bytes(), false)
            .unwrap();
        let found = rocksdb.get("merry".as_bytes()).unwrap();
        assert_eq!(found.as_deref(), Some("world".as_bytes()));

        // A batch overwrites one key and creates another.
        let mut puts = RocksDB::new_write_batch();
        puts.put("merry", "RocksDB");
        puts.put("peace", "love");
        rocksdb.write(puts, false).unwrap();

        let found = rocksdb.get("merry".as_bytes()).unwrap();
        assert_eq!(found.as_deref(), Some("RocksDB".as_bytes()));
        let found = rocksdb.get("peace".as_bytes()).unwrap();
        assert_eq!(found.as_deref(), Some("love".as_bytes()));

        // Merge operands: two on an existing key, sixteen on a new key.
        let mut merges = RocksDB::new_write_batch();
        merges.merge("merry", "1");
        merges.merge("merry", "2");
        for i in 0..16 {
            merges.merge("merge", format!("{i}"));
        }
        rocksdb.write(merges, false).unwrap();

        let found = rocksdb.get("merry".as_bytes()).unwrap();
        assert_eq!(found.as_deref(), Some("RocksDB12".as_bytes()));
        let found = rocksdb.get("merge".as_bytes()).unwrap();
        assert_eq!(found.as_deref(), Some("0123456789101112131415".as_bytes()));

        // Prefix iteration returns the number of matching entries.
        let mut it = rocksdb.new_iterator();
        let mut count = 0;
        let mut runner = |_: &[u8], _: &[u8]| {
            count += 1;
            crate::Result::Ok(())
        };

        assert_eq!(it.iterate([], &mut runner).unwrap(), 3);
        assert_eq!(it.iterate("m", &mut runner).unwrap(), 2);
        assert_eq!(it.iterate("a", &mut runner).unwrap(), 0);
        assert_eq!(it.iterate("z", &mut runner).unwrap(), 0);

        // Opening at `/proc/test` is expected to fail.
        let bad_config = RocksDBConfig {
            path: std::path::Path::new("/proc/test").into(),
            create: true,
            ..Default::default()
        };
        assert!(RocksDB::open::<MetaMergeOp>(&bad_config).is_err());
    }

    /// Sixteen threads each write a thousand small batches concurrently.
    #[test]
    fn test_rocksdb_parallel_write() {
        use super::super::*;
        use std::sync::Arc;

        const T: usize = 16;
        const N: usize = 1000;

        let tmp = tempfile::tempdir().unwrap();
        let config = RocksDBConfig {
            path: tmp.path().into(),
            create: true,
            ..Default::default()
        };
        let rocksdb = Arc::new(RocksDB::open::<MetaMergeOp>(&config).unwrap());

        // Spawn every worker first, then join them all.
        let workers: Vec<_> = (0..T)
            .map(|i| {
                let rocksdb = rocksdb.clone();
                std::thread::Builder::new()
                    .name(format!("test-{i}"))
                    .spawn(move || {
                        for j in 0..N {
                            let value = [j as u8; 32];
                            let mut batch = RocksDB::new_write_batch();
                            batch.put(format!("a{}atesta", i * N + j), value);
                            batch.put(format!("b{}btestb", i * N + j), value);
                            batch.merge(format!("m{}mtestm", i * N + j), value);
                            rocksdb.write(batch, false).unwrap();
                        }
                    })
                    .unwrap()
            })
            .collect();

        for worker in workers {
            worker.join().unwrap();
        }
    }

    /// An empty merge operand makes reads of that key fail until the key
    /// is overwritten by a plain put.
    #[test]
    fn test_rocksdb_invalid_merge() {
        use super::super::*;

        let tmp = tempfile::tempdir().unwrap();
        let config = RocksDBConfig {
            path: tmp.path().into(),
            create: true,
            ..Default::default()
        };
        let rocksdb = RocksDB::open::<MetaMergeOp>(&config).unwrap();

        // Write an empty merge operand.
        let mut batch = RocksDB::new_write_batch();
        batch.merge("invalid_merge", "");
        rocksdb.write(batch, false).unwrap();

        // Point reads, iteration and seek all surface the failure.
        assert!(rocksdb.get("invalid_merge").is_err());

        let mut runner = |_: &[u8], _: &[u8]| crate::Result::Ok(());
        let mut it = rocksdb.new_iterator();
        assert!(it.iterate("invalid_merge", &mut runner).is_err());
        assert!(it.seek("invalid_merge").is_err());

        // A plain put makes the key readable again.
        assert!(rocksdb.put("invalid_merge", "ok", false).is_ok());
        assert!(rocksdb.get("invalid_merge").is_ok());
        drop(it);

        // A new iterator sees the key as a single healthy entry.
        let mut it = rocksdb.new_iterator();
        assert_eq!(it.iterate("invalid_merge", &mut runner), Ok(1));

        it.seek("invalid_merge").unwrap();
        assert!(it.valid());
        assert_eq!(it.key().unwrap(), "invalid_merge".as_bytes());
        assert_eq!(it.value().unwrap(), "ok".as_bytes());

        it.next();
        assert!(!it.valid());
        assert!(it.status().is_ok());

        drop(it);
        drop(rocksdb);

        // The same directory can be reopened read-only.
        let read_only_config = RocksDBConfig {
            path: tmp.path().into(),
            create: false,
            read_only: true,
        };
        RocksDB::open::<MetaMergeOp>(&read_only_config).unwrap();
    }
}
|
||||
84
src/storage/chunk_engine/src/types/chunk_meta.rs
Normal file
84
src/storage/chunk_engine/src/types/chunk_meta.rs
Normal file
@@ -0,0 +1,84 @@
|
||||
use super::super::*;
|
||||
|
||||
pub type ETag = tinyvec::TinyVec<[u8; 14]>;
|
||||
|
||||
#[derive(derse::Serialize, derse::Deserialize, Clone, PartialEq, Eq, Debug)]
|
||||
#[repr(C)]
|
||||
pub struct ChunkMeta {
|
||||
pub pos: Position,
|
||||
pub chain_ver: u32,
|
||||
pub chunk_ver: u32,
|
||||
pub len: u32,
|
||||
pub checksum: u32,
|
||||
pub timestamp: u64,
|
||||
pub last_request_id: u64,
|
||||
pub last_client_low: u64,
|
||||
pub last_client_high: u64,
|
||||
pub etag: ETag,
|
||||
pub uncommitted: bool,
|
||||
}
|
||||
|
||||
impl ChunkMeta {
|
||||
pub fn now() -> u64 {
|
||||
std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_micros() as _
|
||||
}
|
||||
|
||||
pub fn set_default_etag_if_need(&mut self) {
|
||||
if self.etag.is_empty() {
|
||||
self.etag = ETag::from(format!("{:X}", self.checksum).as_bytes());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ChunkMeta {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
pos: Position::new(GroupId::new(Size::GB, 0, 0), 0),
|
||||
chain_ver: 0,
|
||||
chunk_ver: 0,
|
||||
len: 0,
|
||||
checksum: 0,
|
||||
timestamp: Self::now(),
|
||||
last_request_id: 0,
|
||||
last_client_low: 0,
|
||||
last_client_high: 0,
|
||||
etag: Default::default(),
|
||||
uncommitted: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use derse::{Deserialize, Serialize};

    /// Pins the exact serialized bytes of a `ChunkMeta` and checks the
    /// round trip back through `deserialize`.
    #[test]
    fn test_chunk_meta_seralization() {
        let original = ChunkMeta {
            pos: Position::new(GroupId::default(), 88),
            chain_ver: 1,
            chunk_ver: 1,
            len: 2,
            timestamp: 0,
            etag: ETag::from(b"hello".as_slice()),
            ..Default::default()
        };

        let encoded: derse::DownwardBytes = original.serialize().unwrap();
        assert_eq!(
            encoded.as_slice(),
            &[
                63, 88, 0, 0, 0, 0, 0, 8, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 5, b'h', b'e', b'l', b'l', b'o', 0,
            ]
        );

        let decoded = ChunkMeta::deserialize(&encoded[..]).unwrap();
        assert_eq!(original, decoded);
    }
}
|
||||
8
src/storage/chunk_engine/src/types/constants.rs
Normal file
8
src/storage/chunk_engine/src/types/constants.rs
Normal file
@@ -0,0 +1,8 @@
|
||||
use super::super::Size;
|
||||
|
||||
pub const CHUNK_SIZE_SMALL: Size = Size::kibibyte(64);
|
||||
pub const CHUNK_SIZE_NORMAL: Size = Size::kibibyte(512);
|
||||
pub const CHUNK_SIZE_LARGE: Size = Size::mebibyte(4);
|
||||
pub const CHUNK_SIZE_ULTRA: Size = Size::mebibyte(64);
|
||||
pub const CHUNK_SIZE_SHIFT: usize = 16; // 64KiB is 2^16
|
||||
pub const CHUNK_SIZE_NUMBER: usize = 11; // from 64KiB to 64MiB
|
||||
114
src/storage/chunk_engine/src/types/group_id.rs
Normal file
114
src/storage/chunk_engine/src/types/group_id.rs
Normal file
@@ -0,0 +1,114 @@
|
||||
use super::super::*;
|
||||
|
||||
#[derive(Copy, Clone, Eq, PartialEq, Hash, PartialOrd, Ord)]
|
||||
pub struct GroupId(pub u64);
|
||||
|
||||
impl Default for GroupId {
|
||||
fn default() -> Self {
|
||||
GroupId::new(CHUNK_SIZE_NORMAL, 0, 0)
|
||||
}
|
||||
}
|
||||
|
||||
impl GroupId {
|
||||
// 32bit chunk size + 24bit group + 8bit cluster
|
||||
const SHIFT: u32 = 8;
|
||||
pub const COUNT: u32 = (1 << Self::SHIFT);
|
||||
|
||||
pub const fn new(chunk_size: Size, cluster: u8, group: u32) -> Self {
|
||||
Self(chunk_size.0 << 32 | (group << Self::SHIFT | cluster as u32) as u64)
|
||||
}
|
||||
|
||||
pub const fn chunk_size(&self) -> Size {
|
||||
Size::new(self.0 >> 32)
|
||||
}
|
||||
|
||||
pub const fn cluster(&self) -> u8 {
|
||||
self.0 as u8
|
||||
}
|
||||
|
||||
pub const fn group(&self) -> u32 {
|
||||
(self.0 as u32) >> Self::SHIFT
|
||||
}
|
||||
|
||||
pub fn offset(&self) -> Size {
|
||||
const MARKS: u64 = !(GroupId::COUNT - 1) as u64;
|
||||
self.chunk_size() * (self.0 & MARKS)
|
||||
}
|
||||
|
||||
pub fn size(&self) -> Size {
|
||||
self.chunk_size() * GroupId::COUNT as u64
|
||||
}
|
||||
|
||||
pub fn plus_one(&self) -> Self {
|
||||
Self(self.0 + 1)
|
||||
}
|
||||
|
||||
pub fn next(&mut self) {
|
||||
self.0 += 1
|
||||
}
|
||||
|
||||
pub const fn inner(&self) -> u64 {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u64> for GroupId {
|
||||
fn from(value: u64) -> Self {
|
||||
Self(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GroupId> for u64 {
|
||||
fn from(val: GroupId) -> Self {
|
||||
val.0
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Deref for GroupId {
|
||||
type Target = u64;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for GroupId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"GroupId {{ chunk_size: {}, cluster: {}, group: {} }}",
|
||||
self.chunk_size(),
|
||||
self.cluster(),
|
||||
self.group(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// `plus_one` walks the cluster byte and carries into the group number.
    #[test]
    fn test_group_id_next() {
        let mut group_id = GroupId::default();

        for _ in 0..1000 {
            for i in 0..=255 {
                let next = group_id.plus_one();
                // The chunk size never changes.
                assert_eq!(group_id.chunk_size(), next.chunk_size());
                if i == 255 {
                    // Cluster wraps to 0 and the group advances.
                    assert_eq!(0, next.cluster());
                    assert_eq!(group_id.group() + 1, next.group());
                } else {
                    assert_eq!(group_id.cluster() + 1, next.cluster());
                    assert_eq!(group_id.group(), next.group());
                }
                group_id = next;
            }
        }

        let raw = u64::from(group_id);
        assert_eq!(raw, group_id.0);
    }
}
|
||||
163
src/storage/chunk_engine/src/types/group_state.rs
Normal file
163
src/storage/chunk_engine/src/types/group_state.rs
Normal file
@@ -0,0 +1,163 @@
|
||||
use super::super::*;
|
||||
use std::num::NonZeroU64;
|
||||
|
||||
/// One word of the bitmap.
type Item = u64;
/// The whole bitmap: 4 x 64 = 256 bits.
type Bits = [Item; 4];

/// Occupancy bitmap of a group together with a cached population count.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct GroupState {
    /// Bit `i` set means slot `i` is taken.
    bits: Bits,
    /// Number of set bits in `bits`.
    count: u32,
}
|
||||
|
||||
impl GroupState {
|
||||
const TOTAL_BYTES: usize = std::mem::size_of::<Bits>();
|
||||
pub const TOTAL_BITS: usize = 8 * Self::TOTAL_BYTES;
|
||||
pub const ITEM_BITS: u8 = 8 * std::mem::size_of::<Item>() as u8;
|
||||
pub const LEN: usize = Self::TOTAL_BYTES / std::mem::size_of::<Item>();
|
||||
pub const LEVELS: usize = 4;
|
||||
|
||||
pub fn from(value: &[u8]) -> Result<Self> {
|
||||
let mut out = Self::empty();
|
||||
if value.len() != Self::TOTAL_BYTES {
|
||||
return Err(Error::MetaError(format!(
|
||||
"group state load bytes {} != {}",
|
||||
value.len(),
|
||||
Self::TOTAL_BYTES
|
||||
)));
|
||||
}
|
||||
out.as_mut_bytes().copy_from_slice(value);
|
||||
out.count = out.bits.iter().map(|b| b.count_ones()).sum();
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
pub const fn empty() -> Self {
|
||||
Self {
|
||||
bits: [0; Self::LEN],
|
||||
count: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn full() -> Self {
|
||||
Self {
|
||||
bits: [!0; Self::LEN],
|
||||
count: Self::TOTAL_BITS as u32,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.count == 0
|
||||
}
|
||||
|
||||
pub fn is_full(&self) -> bool {
|
||||
self.count == Self::TOTAL_BITS as u32
|
||||
}
|
||||
|
||||
pub fn allocate(&mut self) -> Option<u8> {
|
||||
for (i, v) in self.bits.iter_mut().enumerate() {
|
||||
if let Some(mark) = NonZeroU64::new(!*v) {
|
||||
let idx = mark.trailing_zeros();
|
||||
*v |= 1 << idx;
|
||||
self.count += 1;
|
||||
return Some(i as u8 * Self::ITEM_BITS + idx as u8);
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
pub fn count(&self) -> u32 {
|
||||
self.count
|
||||
}
|
||||
|
||||
pub fn level(&self) -> u32 {
|
||||
self.count() / (Self::TOTAL_BITS / Self::LEVELS) as u32
|
||||
}
|
||||
|
||||
pub fn check(&self, index: u8) -> bool {
|
||||
let x = index / Self::ITEM_BITS;
|
||||
let y = index % Self::ITEM_BITS;
|
||||
self.bits[x as usize] & (1 << y) != 0
|
||||
}
|
||||
|
||||
pub fn deallocate(&mut self, index: u8) -> Result<()> {
|
||||
let x = index / Self::ITEM_BITS;
|
||||
let y = index % Self::ITEM_BITS;
|
||||
let mark = &mut self.bits[x as usize];
|
||||
if *mark & (1 << y) != 0 {
|
||||
*mark ^= 1 << y;
|
||||
self.count -= 1;
|
||||
Ok(())
|
||||
} else {
|
||||
Err(Error::MetaError(format!(
|
||||
"group state deallocate fail: index {}",
|
||||
index
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn update(&mut self, merge_bits: &MergeState) {
|
||||
for pos in &merge_bits.acquire {
|
||||
let x = pos / Self::ITEM_BITS;
|
||||
let y = pos % Self::ITEM_BITS;
|
||||
self.bits[x as usize] |= 1 << y;
|
||||
}
|
||||
for pos in &merge_bits.release {
|
||||
let x = pos / Self::ITEM_BITS;
|
||||
let y = pos % Self::ITEM_BITS;
|
||||
self.bits[x as usize] &= !(1 << y);
|
||||
}
|
||||
self.count = self.bits.iter().map(|b| b.count_ones()).sum();
|
||||
}
|
||||
|
||||
pub fn as_bytes(&self) -> &[u8; Self::TOTAL_BYTES] {
|
||||
unsafe { std::mem::transmute(&self.bits) }
|
||||
}
|
||||
|
||||
pub fn as_mut_bytes(&mut self) -> &mut [u8; Self::TOTAL_BYTES] {
|
||||
unsafe { std::mem::transmute(&mut self.bits) }
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Allocates every slot in order, frees them in random order, and
    /// checks the byte round trip.
    #[test]
    fn test_group_bits_normal() {
        use rand::seq::SliceRandom;

        let mut group_state = GroupState::empty();
        assert_eq!(group_state.count(), 0);

        // Slots are handed out in ascending order until the state is full.
        for expected in 0..=255 {
            assert_eq!(expected, group_state.allocate().unwrap());
        }
        assert!(group_state.allocate().is_none());
        assert_eq!(group_state.count(), 256);

        // Free in random order; a second free of the same slot must fail.
        let mut order = (0..=255).collect::<Vec<u8>>();
        order.shuffle(&mut rand::thread_rng());
        for slot in order {
            group_state.deallocate(slot).unwrap();
            group_state.deallocate(slot).unwrap_err();

            let reused = group_state.allocate().unwrap();
            group_state.deallocate(reused).unwrap();
            group_state.deallocate(reused).unwrap_err();
        }
        assert_eq!(group_state.count(), 0);

        // Take slots 0 and 1, then free slot 0; slot 1 must remain taken.
        group_state.allocate().unwrap();
        group_state.allocate().unwrap();
        group_state.deallocate(0).unwrap();
        assert!(group_state.check(1));

        // Byte round trip, plus rejection of a short buffer.
        let bytes = group_state.as_bytes();
        let restored = GroupState::from(bytes).unwrap();
        assert_eq!(restored, group_state);

        assert!(GroupState::from(&bytes[1..]).is_err());
    }
}
|
||||
89
src/storage/chunk_engine/src/types/merge_state.rs
Normal file
89
src/storage/chunk_engine/src/types/merge_state.rs
Normal file
@@ -0,0 +1,89 @@
|
||||
use std::collections::HashSet;
|
||||
|
||||
use super::super::*;
|
||||
use derse::Deserialize;
|
||||
|
||||
#[derive(Clone, Debug, Default, derse::Deserialize, derse::Serialize, PartialEq)]
|
||||
pub struct MergeState {
|
||||
pub acquire: HashSet<u8>,
|
||||
pub release: HashSet<u8>,
|
||||
}
|
||||
|
||||
impl MergeState {
|
||||
pub fn empty() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
pub fn from(value: &[u8]) -> Result<Self> {
|
||||
Self::deserialize(value).map_err(Error::SerializationError)
|
||||
}
|
||||
|
||||
pub fn acquire(pos: u8) -> Self {
|
||||
let mut b = Self::empty();
|
||||
b.acquire.insert(pos);
|
||||
b
|
||||
}
|
||||
|
||||
pub fn release(pos: u8) -> Self {
|
||||
let mut b = Self::empty();
|
||||
b.release.insert(pos);
|
||||
b
|
||||
}
|
||||
|
||||
pub fn merge(&mut self, right: &Self) {
|
||||
for pos in &right.acquire {
|
||||
self.acquire.insert(*pos);
|
||||
self.release.remove(pos);
|
||||
}
|
||||
for pos in &right.release {
|
||||
self.acquire.remove(pos);
|
||||
self.release.insert(*pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Applies merge records to a `GroupState` and checks how records fold.
    #[test]
    fn test_merge_bits() {
        // Applies `merge_bits` to a copy of `bits` and returns the copy.
        fn group_bits_apply(mut bits: GroupState, merge_bits: &MergeState) -> GroupState {
            bits.update(merge_bits);
            bits
        }

        let state = GroupState::empty();

        // An empty record is a no-op.
        assert_eq!(group_bits_apply(state, &MergeState::empty()), state);

        // Acquiring slot 0 sets only the lowest bit of the first byte.
        let acquire_first_bit = MergeState::acquire(0);
        let acquired = group_bits_apply(state, &acquire_first_bit);
        assert_eq!(acquired.as_bytes()[0], 1);
        assert_eq!(acquired.as_bytes()[1..], state.as_bytes()[1..]);

        // Releasing it again restores the empty state.
        let release_first_bit = MergeState::release(0);
        let released = group_bits_apply(acquired, &release_first_bit);
        assert_eq!(released, state);

        // Acquire followed by release folds into a plain release.
        let mut merge_bits = acquire_first_bit;
        merge_bits.merge(&release_first_bit);
        assert_eq!(merge_bits, release_first_bit);
        assert_eq!(state, group_bits_apply(state, &merge_bits));

        // Acquire every slot through merged records.
        let mut merge_bits = MergeState::empty();
        for pos in 0..=255 {
            merge_bits.merge(&MergeState::acquire(pos));
        }
        let full_state = group_bits_apply(state, &merge_bits);
        assert!(full_state.is_full());

        // Then release every slot on top of the same record.
        for pos in 0..=255 {
            merge_bits.merge(&MergeState::release(pos));
        }
        let empty_state = group_bits_apply(full_state, &merge_bits);
        assert_eq!(empty_state, state);

        // Empty input is not a valid serialized record.
        assert!(MergeState::from(&[]).is_err());
    }
}
|
||||
13
src/storage/chunk_engine/src/types/mod.rs
Normal file
13
src/storage/chunk_engine/src/types/mod.rs
Normal file
@@ -0,0 +1,13 @@
|
||||
mod chunk_meta;
|
||||
mod constants;
|
||||
mod group_id;
|
||||
mod group_state;
|
||||
mod merge_state;
|
||||
mod position;
|
||||
|
||||
pub use chunk_meta::*;
|
||||
pub use constants::*;
|
||||
pub use group_id::*;
|
||||
pub use group_state::*;
|
||||
pub use merge_state::*;
|
||||
pub use position::*;
|
||||
117
src/storage/chunk_engine/src/types/position.rs
Normal file
117
src/storage/chunk_engine/src/types/position.rs
Normal file
@@ -0,0 +1,117 @@
|
||||
use super::super::*;
|
||||
|
||||
use derse::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
|
||||
/// Packed location of a chunk slot; see `Position::new` for the layout.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(C)]
pub struct Position(pub u64);
|
||||
|
||||
impl Position {
|
||||
const SHIFT: u32 = 8;
|
||||
|
||||
// 24bit chunk size + 8bit cluster + 24bit group + 8bit zero
|
||||
pub const fn new(group_id: GroupId, index: u8) -> Self {
|
||||
const CLEAN: u64 = !((GroupId::COUNT - 1) as u64);
|
||||
Self(group_id.inner() & CLEAN | index as u64 | (group_id.cluster() as u64) << 32)
|
||||
}
|
||||
|
||||
pub fn group_id(&self) -> GroupId {
|
||||
const MARKS: u64 = (GroupId::COUNT - 1) as u64;
|
||||
const CLEAN: u64 = !(MARKS | MARKS << 32);
|
||||
GroupId::from(self.0 & CLEAN | self.cluster() as u64)
|
||||
}
|
||||
|
||||
pub fn chunk_size(&self) -> Size {
|
||||
Size::new(self.0 >> 40 << 8)
|
||||
}
|
||||
|
||||
pub fn cluster(&self) -> u8 {
|
||||
(self.0 >> 32) as u8
|
||||
}
|
||||
|
||||
pub fn group(&self) -> u32 {
|
||||
(self.0 as u32) >> Self::SHIFT
|
||||
}
|
||||
|
||||
pub fn index(&self) -> u8 {
|
||||
self.0 as u8
|
||||
}
|
||||
|
||||
pub fn offset(&self) -> Size {
|
||||
self.chunk_size() * self.0 as u32 as u64
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Position {
|
||||
fn default() -> Self {
|
||||
Position::new(GroupId::default(), 0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u64> for Position {
|
||||
fn from(value: u64) -> Self {
|
||||
Self(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Deref for Position {
|
||||
type Target = u64;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Position {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"Position {{ chunk_size: {}, cluster: {}, group: {}, index: {} }}",
|
||||
self.chunk_size(),
|
||||
self.cluster(),
|
||||
self.group(),
|
||||
self.index(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for Position {
|
||||
fn serialize_to<T: Serializer>(&self, serializer: &mut T) -> derse::Result<()> {
|
||||
self.0.serialize_to(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Deserialize<'a> for Position {
|
||||
fn deserialize_from<T: Deserializer<'a>>(buf: &mut T) -> derse::Result<Self> {
|
||||
Ok(Self(u64::deserialize_from(buf)?))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a group id, derives a position from it and checks that every
    /// field decodes back to the value it was built from.
    #[test]
    fn test_group_id_and_position() {
        let gid = GroupId::new(64 * Size::KB, 23, 233);
        assert_eq!(gid.chunk_size(), 64 * Size::KB);
        assert_eq!(gid.cluster(), 23);
        assert_eq!(gid.group(), 233);
        let gid_text = format!("{:?}", gid);
        assert_eq!(
            gid_text,
            "GroupId { chunk_size: 64KiB, cluster: 23, group: 233 }"
        );

        let pos = Position::new(gid, 223);
        assert_eq!(pos.chunk_size(), 64 * Size::KB);
        assert_eq!(pos.cluster(), 23);
        assert_eq!(pos.group(), 233);
        assert_eq!(pos.index(), 223);
        assert_eq!(pos.group_id(), gid);
        // `to_be_bytes` is reached through `Deref<Target = u64>`.
        assert_eq!(pos.to_be_bytes().len(), 8);
        let pos_text = format!("{:?}", pos);
        assert_eq!(
            pos_text,
            "Position { chunk_size: 64KiB, cluster: 23, group: 233, index: 223 }"
        );
    }
}
|
||||
21
src/storage/chunk_engine/src/utils/aligned.rs
Normal file
21
src/storage/chunk_engine/src/utils/aligned.rs
Normal file
@@ -0,0 +1,21 @@
|
||||
use super::super::Size;
|
||||
|
||||
/// Alignment (512 bytes) that the helpers in this module require of buffer
/// addresses, buffer lengths and offsets.
pub const ALIGN_SIZE: Size = Size::new(512);
|
||||
|
||||
pub fn create_aligned_vec(size: Size) -> Vec<u8> {
|
||||
let s: usize = size.into();
|
||||
let layout = std::alloc::Layout::from_size_align(s, ALIGN_SIZE.into()).unwrap();
|
||||
unsafe { Vec::from_raw_parts(std::alloc::alloc(layout), s, s) }
|
||||
}
|
||||
|
||||
pub fn is_aligned_buf(data: &[u8]) -> bool {
|
||||
data.as_ptr() as u64 % ALIGN_SIZE.0 == 0 && data.len() as u64 % ALIGN_SIZE.0 == 0
|
||||
}
|
||||
|
||||
pub fn is_aligned_len(len: u32) -> bool {
|
||||
len % ALIGN_SIZE.0 as u32 == 0
|
||||
}
|
||||
|
||||
pub fn is_aligned_io(data: &[u8], offset: u32) -> bool {
|
||||
is_aligned_buf(data) && is_aligned_len(offset)
|
||||
}
|
||||
1
src/storage/chunk_engine/src/utils/bytes.rs
Normal file
1
src/storage/chunk_engine/src/utils/bytes.rs
Normal file
@@ -0,0 +1 @@
|
||||
/// Byte string backed by `tinyvec::TinyVec` with a 28-byte inline array.
pub type Bytes = tinyvec::TinyVec<[u8; 28]>;
|
||||
15
src/storage/chunk_engine/src/utils/mod.rs
Normal file
15
src/storage/chunk_engine/src/utils/mod.rs
Normal file
@@ -0,0 +1,15 @@
|
||||
mod aligned;
|
||||
mod bytes;
|
||||
mod result;
|
||||
mod shards_map;
|
||||
mod shards_set;
|
||||
mod size;
|
||||
mod worker;
|
||||
|
||||
pub use aligned::*;
|
||||
pub use bytes::*;
|
||||
pub use result::*;
|
||||
pub use shards_map::*;
|
||||
pub use shards_set::*;
|
||||
pub use size::*;
|
||||
pub use worker::*;
|
||||
34
src/storage/chunk_engine/src/utils/result.rs
Normal file
34
src/storage/chunk_engine/src/utils/result.rs
Normal file
@@ -0,0 +1,34 @@
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum Error {
|
||||
IoError(String),
|
||||
RocksDBError(String),
|
||||
MetaError(String),
|
||||
InvalidArg(String),
|
||||
SerializationError(derse::Error),
|
||||
ChecksumMismatch(String),
|
||||
ChainVersionMismatch(String),
|
||||
ChunkETagMismatch(String),
|
||||
ChunkAlreadyExists,
|
||||
ChunkCommittedUpdate(String),
|
||||
ChunkMissingUpdate(String),
|
||||
NoSpace,
|
||||
}
|
||||
|
||||
/// `std::result::Result` specialised to this module's `Error`.
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
impl std::fmt::Display for Error {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
std::fmt::Debug::fmt(self, f)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// `to_string()` must match the derived `Debug` output.
    #[test]
    fn test_error_display() {
        let rendered = Error::InvalidArg(String::from("invalid pos")).to_string();
        assert_eq!(rendered, "InvalidArg(\"invalid pos\")");
    }
}
|
||||
153
src/storage/chunk_engine/src/utils/shards_map.rs
Normal file
153
src/storage/chunk_engine/src/utils/shards_map.rs
Normal file
@@ -0,0 +1,153 @@
|
||||
use std::borrow::Borrow;
|
||||
use std::collections::{
|
||||
hash_map::{DefaultHasher, Entry},
|
||||
HashMap,
|
||||
};
|
||||
use std::hash::{Hash, Hasher};
|
||||
|
||||
/// A hash map split into `S` independent `HashMap` shards (64 by default);
/// each key is routed to one shard by its hash.
pub struct ShardsMap<K, V, const S: usize = 64> {
    shards: [HashMap<K, V>; S],
}
|
||||
|
||||
/// Iterator over every entry of a `ShardsMap`, one shard after another.
pub struct ShardsMapIter<'a, K, V> {
    /// Shards that have not been started yet.
    array_it: std::slice::Iter<'a, HashMap<K, V>>,
    /// Entries of the shard currently being walked.
    inner_it: std::collections::hash_map::Iter<'a, K, V>,
}
|
||||
|
||||
impl<K, V, const S: usize> ShardsMap<K, V, S>
|
||||
where
|
||||
K: Eq + Hash,
|
||||
{
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
shards: [(); S].map(|_| Default::default()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_capacity(capacity: usize) -> Self {
|
||||
let cap = (capacity / S).next_power_of_two();
|
||||
Self {
|
||||
shards: [(); S].map(|_| HashMap::with_capacity(cap)),
|
||||
}
|
||||
}
|
||||
|
||||
fn shard<Q>(key: &Q) -> usize
|
||||
where
|
||||
K: Borrow<Q>,
|
||||
Q: Eq + Hash + ?Sized,
|
||||
{
|
||||
let mut s = DefaultHasher::new();
|
||||
key.hash(&mut s);
|
||||
s.finish() as usize % S
|
||||
}
|
||||
|
||||
pub fn get<Q>(&self, k: &Q) -> Option<&V>
|
||||
where
|
||||
K: Borrow<Q>,
|
||||
Q: Eq + Hash + ?Sized,
|
||||
{
|
||||
self.shards[Self::shard(k)].get(k)
|
||||
}
|
||||
|
||||
pub fn get_mut<Q>(&mut self, k: &Q) -> Option<&mut V>
|
||||
where
|
||||
K: Borrow<Q>,
|
||||
Q: Eq + Hash + ?Sized,
|
||||
{
|
||||
self.shards[Self::shard(k)].get_mut(k)
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.shards.iter().all(|m| m.is_empty())
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.shards.iter().map(|m| m.len()).sum()
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> ShardsMapIter<'_, K, V> {
|
||||
ShardsMapIter {
|
||||
array_it: self.shards[1..].iter(),
|
||||
inner_it: self.shards[0].iter(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, k: K, v: V) -> Option<V> {
|
||||
self.shards[Self::shard(&k)].insert(k, v)
|
||||
}
|
||||
|
||||
pub fn entry(&mut self, k: K) -> Entry<'_, K, V> {
|
||||
self.shards[Self::shard(&k)].entry(k)
|
||||
}
|
||||
|
||||
pub fn remove<Q>(&mut self, k: &Q) -> Option<V>
|
||||
where
|
||||
K: Borrow<Q>,
|
||||
Q: Eq + Hash + ?Sized,
|
||||
{
|
||||
self.shards[Self::shard(k)].remove(k)
|
||||
}
|
||||
}
|
||||
|
||||
impl<K, V, const S: usize> Default for ShardsMap<K, V, S>
|
||||
where
|
||||
K: Eq + Hash,
|
||||
{
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, K, V> Iterator for ShardsMapIter<'a, K, V> {
|
||||
type Item = (&'a K, &'a V);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
loop {
|
||||
if let Some(value) = self.inner_it.next() {
|
||||
return Some(value);
|
||||
} else if let Some(map) = self.array_it.next() {
|
||||
self.inner_it = map.iter();
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Inserts, iterates, mutates and removes `N` keys on a 4-shard map.
    #[test]
    fn test_shards_map() {
        const N: usize = 1024;

        let mut map = ShardsMap::<usize, usize, 4>::with_capacity(1024);
        assert!(map.is_empty());
        assert_eq!(map.len(), 0);

        for key in 0..N {
            assert!(map.get(&key).is_none());
            map.insert(key, key * key);
        }
        assert!(!map.is_empty());
        assert_eq!(map.len(), N);

        // Every entry must be visited exactly once and hold the square of its key.
        let mut visited = 0;
        for (k, v) in map.iter() {
            assert_eq!(k * k, *v);
            visited += 1;
        }
        assert_eq!(visited, N);

        for key in 0..N {
            let value = map.get_mut(&key).unwrap();
            assert_eq!(key * key, *value);
            map.entry(key).and_modify(|v| *v += 1);
            assert_eq!(map.remove(&key).unwrap(), key * key + 1);
        }

        assert!(ShardsMap::<usize, usize, 4>::default().is_empty());
    }
}
|
||||
128
src/storage/chunk_engine/src/utils/shards_set.rs
Normal file
128
src/storage/chunk_engine/src/utils/shards_set.rs
Normal file
@@ -0,0 +1,128 @@
|
||||
use std::borrow::Borrow;
|
||||
use std::collections::{hash_map::DefaultHasher, HashSet};
|
||||
use std::hash::{Hash, Hasher};
|
||||
|
||||
/// A hash set split into `S` independent `HashSet` shards (64 by default);
/// each value is routed to one shard by its hash.
pub struct ShardsSet<T, const S: usize = 64> {
    shards: [HashSet<T>; S],
}
|
||||
|
||||
/// Iterator over every value of a `ShardsSet`, one shard after another.
pub struct ShardsSetIter<'a, T> {
    /// Shards that have not been started yet.
    array_it: std::slice::Iter<'a, HashSet<T>>,
    /// Values of the shard currently being walked.
    inner_it: std::collections::hash_set::Iter<'a, T>,
}
|
||||
|
||||
impl<T, const S: usize> ShardsSet<T, S>
|
||||
where
|
||||
T: Eq + Hash,
|
||||
{
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
shards: [(); S].map(|_| Default::default()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_capacity(capacity: usize) -> Self {
|
||||
let cap = (capacity / S).next_power_of_two();
|
||||
Self {
|
||||
shards: [(); S].map(|_| HashSet::with_capacity(cap)),
|
||||
}
|
||||
}
|
||||
|
||||
fn shard<Q>(key: &Q) -> usize
|
||||
where
|
||||
T: Borrow<Q>,
|
||||
Q: Eq + Hash + ?Sized,
|
||||
{
|
||||
let mut s = DefaultHasher::new();
|
||||
key.hash(&mut s);
|
||||
s.finish() as usize % S
|
||||
}
|
||||
|
||||
pub fn contains<Q>(&self, value: &Q) -> bool
|
||||
where
|
||||
T: Borrow<Q>,
|
||||
Q: Eq + Hash + ?Sized,
|
||||
{
|
||||
self.shards[Self::shard(value)].contains(value)
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.shards.iter().all(|m| m.is_empty())
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.shards.iter().map(|m| m.len()).sum()
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> ShardsSetIter<'_, T> {
|
||||
ShardsSetIter {
|
||||
array_it: self.shards[1..].iter(),
|
||||
inner_it: self.shards[0].iter(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, value: T) -> bool {
|
||||
self.shards[Self::shard(&value)].insert(value)
|
||||
}
|
||||
|
||||
pub fn remove<Q>(&mut self, value: &Q) -> bool
|
||||
where
|
||||
T: Borrow<Q>,
|
||||
Q: Eq + Hash + ?Sized,
|
||||
{
|
||||
self.shards[Self::shard(value)].remove(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T, const S: usize> Default for ShardsSet<T, S>
|
||||
where
|
||||
T: Eq + Hash,
|
||||
{
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> Iterator for ShardsSetIter<'a, T> {
|
||||
type Item = &'a T;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
loop {
|
||||
if let Some(value) = self.inner_it.next() {
|
||||
return Some(value);
|
||||
} else if let Some(map) = self.array_it.next() {
|
||||
self.inner_it = map.iter();
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Inserts, iterates and removes `N` values on a 4-shard set.
    ///
    /// Named after the set (the previous name was copied from the map tests)
    /// and extended to cover `iter()`, which was not exercised before.
    #[test]
    fn test_shards_set() {
        let mut set = ShardsSet::<usize, 4>::with_capacity(1024);
        assert!(set.is_empty());
        assert_eq!(set.len(), 0);

        const N: usize = 1024;
        for i in 0..N {
            assert!(!set.contains(&i));
            assert!(set.insert(i));
        }
        assert!(!set.is_empty());
        assert_eq!(set.len(), N);

        // Iteration must visit every inserted value exactly once, in any order.
        let mut seen: Vec<usize> = set.iter().copied().collect();
        seen.sort_unstable();
        assert_eq!(seen, (0..N).collect::<Vec<_>>());

        for i in 0..N {
            assert!(set.contains(&i));
            assert!(set.remove(&i));
            assert!(!set.remove(&i));
        }
        assert_eq!(set.iter().count(), 0);

        assert!(ShardsSet::<usize>::default().is_empty());
    }
}
|
||||
283
src/storage/chunk_engine/src/utils/size.rs
Normal file
283
src/storage/chunk_engine/src/utils/size.rs
Normal file
@@ -0,0 +1,283 @@
|
||||
/// A byte count stored as a plain `u64`.
#[derive(Default, Copy, Clone, Eq, PartialEq, Hash, PartialOrd, Ord)]
#[repr(C)]
pub struct Size(pub u64);

impl Size {
    pub const B: Size = Size::byte(1);
    pub const KB: Size = Size::kibibyte(1);
    pub const MB: Size = Size::mebibyte(1);
    pub const GB: Size = Size::gibibyte(1);
    pub const TB: Size = Size::tebibyte(1);

    /// Wraps a raw byte count.
    pub const fn new(v: u64) -> Size {
        Size(v)
    }

    /// The empty size.
    pub const fn zero() -> Size {
        Size::new(0)
    }

    /// `value` bytes.
    pub const fn byte(value: u64) -> Size {
        Size::new(value)
    }

    /// `value` KiB (`value << 10` bytes).
    pub const fn kibibyte(value: u64) -> Size {
        Size::new(value << 10)
    }

    /// `value` MiB (`value << 20` bytes).
    pub const fn mebibyte(value: u64) -> Size {
        Size::new(value << 20)
    }

    /// `value` GiB (`value << 30` bytes).
    pub const fn gibibyte(value: u64) -> Size {
        Size::new(value << 30)
    }

    /// `value` TiB (`value << 40` bytes).
    pub const fn tebibyte(value: u64) -> Size {
        Size::new(value << 40)
    }

    /// Approximate human-readable rendering with two decimals.
    ///
    /// The largest unit of which the value is at least one half is used
    /// (`512B` → `0.50KiB`); anything below half a KiB is printed in bytes.
    pub fn around(&self) -> String {
        // Largest unit first so the first match wins.
        const UNITS: [(u64, &str); 4] = [
            (Size::TB.0, "TiB"),
            (Size::GB.0, "GiB"),
            (Size::MB.0, "MiB"),
            (Size::KB.0, "KiB"),
        ];

        if self.0 == 0 {
            return "0B".to_string();
        }
        for &(unit, suffix) in UNITS.iter() {
            // `value >= unit / 2` is equivalent to `value * 2 >= unit` because
            // every unit is even, but it cannot overflow for values above
            // `u64::MAX / 2` (the multiplication panicked in debug builds and
            // wrapped in release builds, selecting the wrong unit).
            if self.0 >= unit / 2 {
                return format!("{:.2}{}", self.0 as f64 / unit as f64, suffix);
            }
        }
        format!("{}B", self.0)
    }

    /// `true` when the byte count is a power of two.
    pub fn is_power_of_two(&self) -> bool {
        self.0.is_power_of_two()
    }

    /// Smallest power of two greater than or equal to this size.
    pub fn next_power_of_two(&self) -> Size {
        Size(self.0.next_power_of_two())
    }

    /// Number of trailing zero bits of the byte count.
    pub fn trailing_zeros(&self) -> u32 {
        self.0.trailing_zeros()
    }
}
|
||||
|
||||
macro_rules! impl_trait_for_size {
|
||||
($($t:ty),*) => {
|
||||
$(impl From<$t> for Size {
|
||||
fn from(value: $t) -> Self {
|
||||
Self::new(value as _)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Size> for $t {
|
||||
fn from(val: Size) -> Self {
|
||||
val.0 as _
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq<$t> for Size {
|
||||
fn eq(&self, other: &$t) -> bool {
|
||||
self.0 == *other as u64
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq<Size> for $t {
|
||||
fn eq(&self, other: &Size) -> bool {
|
||||
*self as u64 == other.0
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Add<$t> for Size {
|
||||
type Output = Size;
|
||||
|
||||
fn add(self, rhs: $t) -> Self::Output {
|
||||
Size::new(self.0 + rhs as u64)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Add<Size> for $t {
|
||||
type Output = Size;
|
||||
|
||||
fn add(self, rhs: Size) -> Self::Output {
|
||||
Size::new(self as u64 + rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::AddAssign<$t> for Size {
|
||||
fn add_assign(&mut self, rhs: $t) {
|
||||
self.0 += rhs as u64;
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Mul<$t> for Size {
|
||||
type Output = Size;
|
||||
|
||||
fn mul(self, rhs: $t) -> Self::Output {
|
||||
Size::new(self.0 * rhs as u64)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Mul<Size> for $t {
|
||||
type Output = Size;
|
||||
|
||||
fn mul(self, rhs: Size) -> Self::Output {
|
||||
Size::new(self as u64 * rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::MulAssign<$t> for Size {
|
||||
fn mul_assign(&mut self, rhs: $t) {
|
||||
self.0 *= rhs as u64;
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Rem<$t> for Size {
|
||||
type Output = Size;
|
||||
|
||||
fn rem(self, rhs: $t) -> Self::Output {
|
||||
Size::new(self.0 % rhs as u64)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Rem<Size> for $t {
|
||||
type Output = Size;
|
||||
|
||||
fn rem(self, rhs: Size) -> Self::Output {
|
||||
Size::new(self as u64 % rhs.0)
|
||||
}
|
||||
}
|
||||
)*
|
||||
};
|
||||
}
|
||||
|
||||
impl_trait_for_size! {i32, i64, u32, u64, usize}
|
||||
|
||||
impl std::ops::Add<Self> for Size {
|
||||
type Output = Size;
|
||||
|
||||
fn add(self, rhs: Self) -> Self::Output {
|
||||
Size::new(self.0 + rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::AddAssign<Self> for Size {
|
||||
fn add_assign(&mut self, rhs: Self) {
|
||||
self.0 += rhs.0;
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Sub<Self> for Size {
|
||||
type Output = Size;
|
||||
|
||||
fn sub(self, rhs: Self) -> Self::Output {
|
||||
Size::new(self.0 - rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Mul<Self> for Size {
|
||||
type Output = Size;
|
||||
|
||||
fn mul(self, rhs: Self) -> Self::Output {
|
||||
Size::new(self.0 * rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Div<Self> for Size {
|
||||
type Output = Size;
|
||||
|
||||
fn div(self, rhs: Self) -> Self::Output {
|
||||
Size::new(self.0 / rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Rem<Self> for Size {
|
||||
type Output = Size;
|
||||
|
||||
fn rem(self, rhs: Self) -> Self::Output {
|
||||
Size::new(self.0 % rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Size {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
if self.0 == 0 {
|
||||
write!(f, "0B")
|
||||
} else if *self % Self::TB == 0 {
|
||||
write!(f, "{}TiB", (*self / Self::TB).0)
|
||||
} else if *self % Self::GB == 0 {
|
||||
write!(f, "{}GiB", (*self / Self::GB).0)
|
||||
} else if *self % Self::MB == 0 {
|
||||
write!(f, "{}MiB", (*self / Self::MB).0)
|
||||
} else if *self % Self::KB == 0 {
|
||||
write!(f, "{}KiB", (*self / Self::KB).0)
|
||||
} else {
|
||||
write!(f, "{}B", self.0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Size {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.around())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    /// Covers construction, exact (`Display`) and approximate (`Debug`)
    /// formatting, and the arithmetic operators.
    #[test]
    fn test_size() {
        use super::Size;

        // Construction and exact formatting.
        let size = Size::zero();
        assert_eq!(size, Size::new(0));
        assert_eq!(size.to_string(), "0B".to_string());

        let size = Size::kibibyte(64);
        assert_eq!(size, Size::new(65536));
        assert_eq!(size.to_string(), "64KiB".to_string());

        let size: Size = Size::MB * 23;
        assert_eq!(size, Size::new(23 << 20));
        assert_eq!(size.to_string(), "23MiB".to_string());

        let size: Size = 233 * Size::GB;
        assert_eq!(size, Size::new(233 << 30));
        assert_eq!(size.to_string(), "233GiB".to_string());

        // `Display` picks the largest unit that divides the value evenly.
        let display_cases = [
            (Size::zero(), "0B"),
            (Size::byte(233), "233B"),
            (Size::byte(512), "512B"),
            (Size::kibibyte(512), "512KiB"),
            (Size::mebibyte(512), "512MiB"),
            (Size::gibibyte(512), "512GiB"),
            (Size::tebibyte(512), "512TiB"),
        ];
        for (value, expected) in display_cases.iter() {
            assert_eq!(format!("{}", value), expected.to_string());
        }

        // `Debug` switches to a unit once the value reaches half of it.
        let debug_cases = [
            (Size::zero(), "0B"),
            (Size::byte(233), "233B"),
            (Size::byte(512), "0.50KiB"),
            (Size::kibibyte(512), "0.50MiB"),
            (Size::mebibyte(512), "0.50GiB"),
            (Size::gibibyte(512), "0.50TiB"),
            (Size::tebibyte(512), "512.00TiB"),
        ];
        for (value, expected) in debug_cases.iter() {
            assert_eq!(format!("{:?}", value), expected.to_string());
        }

        // Mixed integer / `Size` arithmetic.
        let r = rand::random::<u64>() % 1024;
        assert_eq!(0 + Size::kibibyte(r), Size::from(r << 10));
        assert_eq!(Size::mebibyte(r) + 0, Size::from(r << 20));
        assert_eq!(1 * Size::gibibyte(r), Size::from(r << 30));
        assert_eq!(Size::tebibyte(r) * 1, Size::from(r << 40));

        assert_eq!(Size::KB * Size::KB, Size::MB);
        let mut size = Size::B;
        size *= 1024;
        assert_eq!(size, Size::KB);
        assert_eq!(size % 1000, 24);

        assert_eq!(Size::KB + Size::KB, Size::kibibyte(2));
        assert_eq!(Size::KB % 1000, Size(24));
    }
}
|
||||
136
src/storage/chunk_engine/src/utils/worker.rs
Normal file
136
src/storage/chunk_engine/src/utils/worker.rs
Normal file
@@ -0,0 +1,136 @@
|
||||
use std::{
|
||||
sync::{
|
||||
atomic::{AtomicBool, Ordering},
|
||||
Arc, Condvar, Mutex,
|
||||
},
|
||||
thread::JoinHandle,
|
||||
};
|
||||
|
||||
/// What a worker closure asks the worker loop to do after each invocation.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum WorkerState {
    /// Call the closure again right away.
    Continue,
    /// Block on the worker's condition variable until it is notified.
    Pause,
    /// Block on the condition variable for at most the given duration.
    Wait(std::time::Duration),
    /// Leave the loop and let the thread finish.
    Stop,
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct WorkerBuilder {
|
||||
name: Option<String>,
|
||||
condvar: Option<Arc<Condvar>>,
|
||||
}
|
||||
|
||||
impl WorkerBuilder {
|
||||
pub fn name(mut self, str: String) -> Self {
|
||||
self.name = Some(str);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn cond(mut self, condvar: Arc<Condvar>) -> Self {
|
||||
self.condvar = Some(condvar);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn spawn<F>(self, f: F) -> Worker
|
||||
where
|
||||
F: FnMut() -> WorkerState + Send + 'static,
|
||||
{
|
||||
Worker::new(f, self.name, self.condvar)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Worker {
|
||||
stopping: Arc<AtomicBool>,
|
||||
condvar: Arc<Condvar>,
|
||||
handle: Option<JoinHandle<()>>,
|
||||
}
|
||||
|
||||
impl Worker {
|
||||
pub fn new<F>(mut f: F, name: Option<String>, condvar: Option<Arc<Condvar>>) -> Worker
|
||||
where
|
||||
F: FnMut() -> WorkerState + Send + 'static,
|
||||
{
|
||||
let stopping = Arc::new(AtomicBool::default());
|
||||
let stopping_clone = stopping.clone();
|
||||
let condvar = condvar.unwrap_or_default();
|
||||
let condvar_clone = condvar.clone();
|
||||
|
||||
let builder = if let Some(name) = name {
|
||||
std::thread::Builder::new().name(name)
|
||||
} else {
|
||||
std::thread::Builder::new()
|
||||
};
|
||||
let handle = Some(
|
||||
builder
|
||||
.spawn(move || {
|
||||
let mutex = Mutex::new(());
|
||||
while !stopping_clone.load(Ordering::Acquire) {
|
||||
match f() {
|
||||
WorkerState::Continue => continue,
|
||||
WorkerState::Pause => {
|
||||
drop(condvar_clone.wait(mutex.lock().unwrap()).unwrap());
|
||||
}
|
||||
WorkerState::Wait(duration) => {
|
||||
drop(
|
||||
condvar_clone
|
||||
.wait_timeout(mutex.lock().unwrap(), duration)
|
||||
.unwrap(),
|
||||
);
|
||||
}
|
||||
WorkerState::Stop => break,
|
||||
}
|
||||
}
|
||||
})
|
||||
.unwrap(),
|
||||
);
|
||||
|
||||
Worker {
|
||||
stopping,
|
||||
condvar,
|
||||
handle,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn stop_and_join(&mut self) {
|
||||
self.stopping.store(true, Ordering::Release);
|
||||
self.condvar.notify_all();
|
||||
if let Some(handle) = self.handle.take() {
|
||||
handle.join().unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// The closure runs ten times, pauses, and is then stopped from outside.
    #[test]
    fn test_worker() {
        let counter = Arc::new(std::sync::atomic::AtomicUsize::default());
        let shared_condvar = Arc::<Condvar>::default();

        let worker_counter = Arc::clone(&counter);
        let mut worker = WorkerBuilder::default()
            .name("Worker".into())
            .cond(shared_condvar)
            .spawn(move || {
                let calls = worker_counter.fetch_add(1, Ordering::SeqCst) + 1;
                if calls < 10 {
                    WorkerState::Continue
                } else {
                    WorkerState::Pause
                }
            });

        // Poll until the closure has been called ten times.
        loop {
            if counter.load(Ordering::Acquire) >= 10 {
                break;
            }
            std::thread::sleep(std::time::Duration::from_millis(10));
        }

        worker.stop_and_join();
        assert_eq!(counter.load(Ordering::Acquire), 10);
    }

    /// A closure that immediately returns `Stop` lets the thread exit on its own.
    #[test]
    fn test_worker_2() {
        let worker = WorkerBuilder::default().spawn(move || WorkerState::Stop);
        let join_handle = worker.handle.unwrap();
        let _ = join_handle.join();
    }
}
|
||||
183
src/storage/service/BufferPool.cc
Normal file
183
src/storage/service/BufferPool.cc
Normal file
@@ -0,0 +1,183 @@
|
||||
#include "storage/service/BufferPool.h"
|
||||
|
||||
#include <folly/experimental/coro/BlockingWait.h>
|
||||
#include <folly/experimental/coro/Collect.h>
|
||||
#include <folly/experimental/coro/Invoke.h>
|
||||
#include <folly/experimental/coro/Task.h>
|
||||
#include <sys/uio.h>
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/net/ib/RDMABuf.h"
|
||||
#include "common/utils/MagicEnum.hpp"
|
||||
#include "fbs/storage/Common.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
namespace {
|
||||
|
||||
// Advances `rdmabuf` so that its start address becomes a multiple of
// kAIOAlignSize. If the buffer is shorter than the padding required, the
// whole buffer is consumed.
void alignBuffer(net::RDMABuf &rdmabuf) {
  const auto address = reinterpret_cast<uint64_t>(rdmabuf.ptr());
  const auto misalignment = address % kAIOAlignSize;
  if (misalignment != 0) {
    auto padding = kAIOAlignSize - misalignment;
    rdmabuf.advance(std::min(padding, rdmabuf.size()));
  }
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Allocates the small and the big RDMA buffers and builds the iovec table.
//
// Small buffers are registered first, so every registered buffer whose index
// is >= bigBufferRegisterIndexStart_ belongs to the big pool. Each kind is
// capped at UIO_MAXIOV / 2 registered buffers.
Result<Void> BufferPool::init(CPUExecutorGroup &executor) {
  buffers_.clear();
  buffers_.reserve(UIO_MAXIOV);

  // Small buffers.
  auto smallBufferResult =
      initBuffers(executor, config_.rdmabuf_size(), config_.rdmabuf_count(), UIO_MAXIOV / 2, buffers_);
  RETURN_AND_LOG_ON_ERROR(smallBufferResult);
  {
    auto smallFree = freeIndex_.lock();
    *smallFree = std::move(*smallBufferResult);
  }

  // Everything registered from here on is a big buffer.
  bigBufferRegisterIndexStart_ = buffers_.size();

  auto bigBufferResult =
      initBuffers(executor, config_.big_rdmabuf_size(), config_.big_rdmabuf_count(), UIO_MAXIOV / 2, buffers_);
  RETURN_AND_LOG_ON_ERROR(bigBufferResult);
  {
    auto bigFree = bigFreeIndex_.lock();
    *bigFree = std::move(*bigBufferResult);
  }

  // One iovec per registered buffer, in registration order.
  iovecs_.clear();
  iovecs_.reserve(buffers_.size());
  for (auto &registered : buffers_) {
    struct iovec entry {};
    entry.iov_base = (void *)registered.ptr();
    entry.iov_len = registered.size();
    iovecs_.push_back(entry);
  }
  return Void{};
}
|
||||
|
||||
// Allocates RDMA memory for `rdmabufCount` logical buffers of `rdmabufSize`
// bytes each and returns one free-list entry per logical buffer.
//
// At most `limit` physical buffers are allocated; when more logical buffers
// are requested, each physical buffer is made large enough to be split into
// several logical ones. Every physical buffer is aligned with alignBuffer()
// and appended to `outBuffers`; its position there becomes the
// `registerIndex` of all logical buffers carved out of it.
//
// Returns an empty list when `rdmabufCount` is 0, and
// StorageCode::kStorageInitFailed when `rdmabufSize` or `limit` is 0 or when
// any allocation fails.
Result<std::vector<BufferIndex>> BufferPool::initBuffers(CPUExecutorGroup &executor,
                                                         Size rdmabufSize,
                                                         uint32_t rdmabufCount,
                                                         uint32_t limit,
                                                         std::vector<net::RDMABuf> &outBuffers) {
  // Guard the divisions below: a zero count (a plain config value) used to
  // divide by zero, and so did a zero size or limit.
  if (rdmabufCount == 0) {
    XLOGF(WARN, "rdmabuf count is 0, skip allocating RDMA buffers of size {}", rdmabufSize);
    return Result<std::vector<BufferIndex>>(std::vector<BufferIndex>{});
  }
  if (UNLIKELY(static_cast<size_t>(rdmabufSize) == 0 || limit == 0)) {
    auto msg = fmt::format("storage init buffer pool failed: rdmabuf size {} or limit {} is zero", rdmabufSize, limit);
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }

  size_t totalSize = rdmabufSize * rdmabufCount;
  size_t bufferCount = std::min(limit, rdmabufCount);
  // Number of logical buffers per physical buffer, rounded up.
  size_t smallBufferCount = (totalSize / bufferCount + rdmabufSize - 1) / rdmabufSize;
  size_t bufferSize = smallBufferCount * rdmabufSize;
  auto pool = net::RDMABufPool::create(bufferSize, bufferCount);

  // Allocate all physical buffers concurrently on the executor group.
  std::vector<folly::coro::TaskWithExecutor<net::RDMABuf>> tasks;
  tasks.reserve(bufferCount);
  for (auto i = 0u; i < bufferCount; ++i) {
    tasks.push_back(pool->allocate().scheduleOn(&executor.pickNext()));
  }
  XLOGF(INFO, "allocate {} * {} RDMA buffers started", bufferCount, Size{bufferSize});
  auto buffers = folly::coro::blockingWait(folly::coro::collectAllRange(std::move(tasks)));
  XLOGF(INFO, "allocate {} * {} RDMA buffers finished", bufferCount, Size{bufferSize});

  std::vector<BufferIndex> freeIndex;
  freeIndex.reserve(rdmabufCount);
  for (auto &buf : buffers) {
    if (UNLIKELY(!buf)) {
      auto msg = fmt::format("storage init buffer pool failed");
      XLOG(ERR, msg);
      return makeError(StorageCode::kStorageInitFailed, std::move(msg));
    }
    alignBuffer(buf);

    BufferIndex bufferIndex;
    bufferIndex.registerIndex = outBuffers.size();
    outBuffers.push_back(buf);

    // Carve the physical buffer into logical buffers of `rdmabufSize` bytes.
    auto split = buf;
    for (; split.size() >= rdmabufSize; split.advance(rdmabufSize)) {
      bufferIndex.buffer = split.first(rdmabufSize);
      freeIndex.push_back(bufferIndex);
    }
  }
  return Result<std::vector<BufferIndex>>(std::move(freeIndex));
}
|
||||
|
||||
// Hands every buffer acquired through this handle back to the pool.
BufferPool::Buffer::~Buffer() {
  for (const auto &held : indices_) {
    pool_->deallocate(held);
  }
}
|
||||
|
||||
// Non-blocking allocation of `size` bytes from the small-buffer pool.
//
// The request is served from the remainder of the current buffer when it is
// large enough; otherwise a new small buffer is taken from the pool.
// Returns StorageCode::kBufferSizeExceeded when `size` is larger than a small
// buffer, and RPCCode::kRDMANoBuf when no buffer is available right now.
Result<net::RDMABuf> BufferPool::Buffer::tryAllocate(uint32_t size) {
  const bool needFresh = indices_.empty() || current_.size() < size;
  if (needFresh) {
    if (UNLIKELY(size > pool_->rdmabufSize_)) {
      return makeError(StorageCode::kBufferSizeExceeded);
    }
    if (UNLIKELY(!pool_->semaphore_.try_wait())) {
      return makeError(RPCCode::kRDMANoBuf);
    }
    auto fresh = pool_->allocate();
    indices_.push_back(fresh);
    current_ = fresh.buffer;
  }

  auto ret = current_.takeFirst(size);
  assert(ret);
  // Re-align what is left so the next allocation starts aligned.
  alignBuffer(current_);
  return ret;
}
|
||||
|
||||
// Allocates `size` bytes, suspending until a buffer becomes available.
//
// The request is served from the remainder of the current buffer when it is
// large enough. Otherwise a new buffer is acquired: from the big pool when
// `size` exceeds the small buffer size, from the small pool in all other
// cases. Returns StorageCode::kBufferSizeExceeded when `size` is larger than
// a big buffer.
CoTryTask<net::RDMABuf> BufferPool::Buffer::allocate(uint32_t size) {
  const bool needFresh = indices_.empty() || current_.size() < size;
  if (needFresh) {
    if (UNLIKELY(size > pool_->bigRdmabufSize_)) {
      co_return makeError(StorageCode::kBufferSizeExceeded);
    }

    BufferIndex fresh;
    if (UNLIKELY(size > pool_->rdmabufSize_)) {
      co_await pool_->bigSemaphore_.co_wait();
      fresh = pool_->allocateBig();
    } else {
      co_await pool_->semaphore_.co_wait();
      fresh = pool_->allocate();
    }
    indices_.push_back(fresh);
    current_ = fresh.buffer;
  }

  auto ret = current_.takeFirst(size);
  assert(ret);
  // Re-align what is left so the next allocation starts aligned.
  alignBuffer(current_);
  co_return ret;
}
|
||||
|
||||
// Releases the registered RDMA buffers. Each buffer is moved into its own
// task and reset there, with the tasks spread over the executor group; the
// call blocks until all of them have run.
void BufferPool::clear(CPUExecutorGroup &executor) {
  std::vector<folly::coro::TaskWithExecutor<void>> releaseTasks;
  releaseTasks.reserve(buffers_.size());
  for (auto &registered : buffers_) {
    auto release = folly::coro::co_invoke([&, released = std::move(registered)]() mutable -> CoTask<void> {
      released = {};
      co_return;
    });
    releaseTasks.push_back(std::move(release).scheduleOn(&executor.pickNext()));
  }

  XLOGF(INFO, "deallocate {} RDMA buffers started", buffers_.size());
  folly::coro::blockingWait(folly::coro::collectAllRange(std::move(releaseTasks)));
  XLOGF(INFO, "deallocate {} RDMA buffers finished", buffers_.size());
}
|
||||
|
||||
// Pops one entry from the small-buffer free list. The list must not be empty
// (asserted); callers take a `semaphore_` token before calling.
BufferIndex BufferPool::allocate() {
  auto freeList = freeIndex_.lock();
  assert(!freeList->empty());
  BufferIndex taken = freeList->back();
  freeList->pop_back();
  return taken;
}
|
||||
|
||||
// Pops one entry from the big-buffer free list. The list must not be empty
// (asserted); callers take a `bigSemaphore_` token before calling.
BufferIndex BufferPool::allocateBig() {
  auto freeList = bigFreeIndex_.lock();
  assert(!freeList->empty());
  BufferIndex taken = freeList->back();
  freeList->pop_back();
  return taken;
}
|
||||
|
||||
void BufferPool::deallocate(const BufferIndex &index) {
|
||||
if (UNLIKELY(index.registerIndex >= bigBufferRegisterIndexStart_)) {
|
||||
bigFreeIndex_.lock()->push_back(index);
|
||||
bigSemaphore_.signal();
|
||||
} else {
|
||||
freeIndex_.lock()->push_back(index);
|
||||
semaphore_.signal();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
92
src/storage/service/BufferPool.h
Normal file
92
src/storage/service/BufferPool.h
Normal file
@@ -0,0 +1,92 @@
|
||||
#pragma once
|
||||
|
||||
#include <folly/Synchronized.h>
#include <folly/executors/CPUThreadPoolExecutor.h>
#include <folly/fibers/Semaphore.h>
#include <limits>
#include <utility>

#include "common/net/ib/RDMABuf.h"
#include "common/utils/CPUExecutorGroup.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/ConstructLog.h"
#include "common/utils/Size.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
struct BufferIndex {
|
||||
uint32_t registerIndex;
|
||||
net::RDMABuf buffer;
|
||||
};
|
||||
|
||||
class BufferPool {
|
||||
public:
|
||||
class Config : public ConfigBase<Config> {
|
||||
CONFIG_ITEM(rdmabuf_size, 4_MB);
|
||||
CONFIG_ITEM(rdmabuf_count, 1024u);
|
||||
CONFIG_ITEM(big_rdmabuf_size, 64_MB);
|
||||
CONFIG_ITEM(big_rdmabuf_count, 64u);
|
||||
};
|
||||
BufferPool(const Config &config)
|
||||
: config_(config),
|
||||
rdmabufSize_(config_.rdmabuf_size()),
|
||||
semaphore_(config_.rdmabuf_count()),
|
||||
bigRdmabufSize_(config_.big_rdmabuf_size()),
|
||||
bigSemaphore_(config_.big_rdmabuf_count()) {}
|
||||
|
||||
Result<Void> init(CPUExecutorGroup &executor);
|
||||
|
||||
auto &iovecs() const { return iovecs_; }
|
||||
|
||||
class Buffer {
|
||||
public:
|
||||
explicit Buffer(BufferPool &pool)
|
||||
: pool_(&pool) {}
|
||||
Buffer(const Buffer &) = delete;
|
||||
Buffer(Buffer &&other) = default;
|
||||
Buffer &operator=(Buffer &&other) = default;
|
||||
~Buffer();
|
||||
|
||||
Result<net::RDMABuf> tryAllocate(uint32_t size);
|
||||
|
||||
CoTryTask<net::RDMABuf> allocate(uint32_t size);
|
||||
|
||||
auto index() const { return indices_.back().registerIndex; }
|
||||
|
||||
private:
|
||||
BufferPool *pool_{};
|
||||
std::vector<BufferIndex> indices_;
|
||||
net::RDMABuf current_;
|
||||
};
|
||||
auto get() { return Buffer{*this}; }
|
||||
|
||||
void clear(CPUExecutorGroup &executor);
|
||||
|
||||
protected:
|
||||
static Result<std::vector<BufferIndex>> initBuffers(CPUExecutorGroup &executor,
|
||||
Size rdmabufSize,
|
||||
uint32_t rdmabufCount,
|
||||
uint32_t limit,
|
||||
std::vector<net::RDMABuf> &outBuffers);
|
||||
|
||||
BufferIndex allocate();
|
||||
|
||||
BufferIndex allocateBig();
|
||||
|
||||
void deallocate(const BufferIndex &index);
|
||||
|
||||
private:
|
||||
ConstructLog<"storage::BufferPool"> constructLog_;
|
||||
const Config &config_;
|
||||
Size rdmabufSize_;
|
||||
std::vector<net::RDMABuf> buffers_;
|
||||
std::vector<struct iovec> iovecs_;
|
||||
folly::fibers::Semaphore semaphore_;
|
||||
folly::Synchronized<std::vector<BufferIndex>, std::mutex> freeIndex_;
|
||||
|
||||
Size bigRdmabufSize_;
|
||||
uint32_t bigBufferRegisterIndexStart_ = 0;
|
||||
folly::fibers::Semaphore bigSemaphore_;
|
||||
folly::Synchronized<std::vector<BufferIndex>, std::mutex> bigFreeIndex_;
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
263
src/storage/service/Components.cc
Normal file
263
src/storage/service/Components.cc
Normal file
@@ -0,0 +1,263 @@
|
||||
#include "storage/service/Components.h"
|
||||
|
||||
#include <folly/experimental/coro/BlockingWait.h>
|
||||
|
||||
#include "common/app/ApplicationBase.h"
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/utils/LogCommands.h"
|
||||
#include "stubs/common/RealStubFactory.h"
|
||||
#include "stubs/mgmtd/MgmtdServiceStub.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
namespace {
|
||||
|
||||
constexpr std::string_view kRoutingInfoListenerName = "Components";
|
||||
monitor::ValueRecorder targetStateRecorder{"storage.target_state", std::nullopt, false};
|
||||
|
||||
} // namespace
|
||||
|
||||
// Wires every sub-component to its own section of `config`. Components that
// need to reach their siblings receive `*this`. Nothing is started here; see
// Components::start(). The initializer order follows the member declaration
// order in Components.h.
Components::Components(const Config &config)
    : config(config),
      rdmabufPool(config.buffer_pool()),
      storageTargets(config.targets(), targetMap),
      aioReadWorker(config.aio_read_worker()),
      messenger(config.forward_client()),
      resyncWorker(config.sync_worker(), *this),
      checkWorker(config.check_worker(), *this),
      dumpWorker(config.dump_worker(), *this),
      allocateWorker(config.allocate_worker(), *this),
      punchHoleWorker(*this),
      syncMetaKvWorker(config.sync_meta_kv_worker(), *this),
      reliableForwarding(config.reliable_forwarding(), *this),
      readPool(config.coroutines_pool_read(), "ReadPool"),
      updatePool(config.coroutines_pool_update(), "UpdatePool"),
      // The sync pool is driven by its dedicated `coroutines_pool_sync` section
      // so that it can be sized independently of the default pool.
      syncPool(config.coroutines_pool_sync(), "SyncPool"),
      defaultPool(config.coroutines_pool_default(), "DefaultPool"),
      storageOperator(config.storage(), *this),
      reliableUpdate(config.reliable_update(), *this) {}
|
||||
|
||||
// Brings the node up step by step. Each step is wrapped in
// RETURN_ON_ERROR_LOG_WRAPPED, so the first failing step aborts the sequence
// and its error is returned. The order of the steps is significant.
Result<Void> Components::start(const flat::AppInfo &appInfo, net::ThreadPoolGroup &tpg) {
  this->appInfo = appInfo;
  auto &procExecutors = tpg.procThreadPool();

  // RDMA buffers.
  RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start rdmabufPool", rdmabufPool.init(procExecutors));

  // Coroutine pools.
  RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start readPool", readPool.start());
  RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start updatePool", updatePool.start());
  RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start syncPool", syncPool.start());
  RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start defaultPool", defaultPool.start());

  // Forwarding, targets and background workers. The aio worker is handed the
  // target file descriptors and the buffer pool's iovec list.
  RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start messenger", messenger.start());
  RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start reliableForwarding", reliableForwarding.init());
  RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start storageTargets", storageTargets.load(procExecutors));
  RETURN_ON_ERROR_LOG_WRAPPED(INFO,
                              "Start aioReadWorker",
                              aioReadWorker.start(storageTargets.fds(), rdmabufPool.iovecs()));
  RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start dumpWorker", dumpWorker.start(appInfo.nodeId));
  RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start allocateWorker", allocateWorker.start());
  RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start punchHoleWorker", punchHoleWorker.start());
  RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start syncMetaKvWorker", syncMetaKvWorker.start());

  // Connect to mgmtd and wait for a usable routing table.
  RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start waitRoutingInfo", waitRoutingInfo(appInfo, tpg.bgThreadPool().randomPick()));

  // Components started only after routing info is available.
  RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start resyncWorker", resyncWorker.start());
  RETURN_ON_ERROR_LOG_WRAPPED(INFO,
                              "Start checkWorker",
                              checkWorker.start(storageTargets.targetPaths(), storageTargets.manufacturers()));

  RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start storageOperator", storageOperator.init(storageTargets.targetPaths().size()));
  return Void{};
}
|
||||
|
||||
// Connects to mgmtd, reports every local target as offline, and blocks until
// the routing info no longer shows any local target as SERVING, SYNCING or
// WAITING. Afterwards it hooks the target map up to heartbeat and routing
// updates. Runs synchronously (blockingWait / sleep) on the calling thread.
Result<Void> Components::waitRoutingInfo(const flat::AppInfo &appInfo, folly::CPUThreadPoolExecutor &executor) {
  // 1. init mgmtd client.
  if (netClient == nullptr) {
    netClient = std::make_unique<net::Client>(config.client());
    RETURN_AND_LOG_ON_ERROR(netClient->start());
  }
  if (!mgmtdClient.load()) {
    using StubFactory = hf3fs::stubs::RealStubFactory<mgmtd::MgmtdServiceStub>;
    auto stubFactory = std::make_unique<StubFactory>(
        hf3fs::stubs::ClientContextCreator{[&](net::Address addr) { return netClient->serdeCtx(addr); }});
    mgmtdClient = std::make_shared<hf3fs::client::MgmtdClientForServer>(appInfo.clusterId,
                                                                        std::move(stubFactory),
                                                                        config.mgmtd());
  }
  auto client = mgmtdClient.load();
  client->setAppInfoForHeartbeat(appInfo);
  client->setConfigListener(ApplicationBase::updateConfig);

  // 2. wait target offline.
  auto currentMap = targetMap.snapshot();
  updateHeartbeatPayload(*currentMap, true);
  folly::coro::blockingWait(client->start(&executor));
  for (int attempt = 0;; ++attempt) {
    // Every attempt after the first is preceded by a one second pause.
    if (attempt != 0) {
      XLOGF(WARNING, "Waiting for target offline in routing info...");
      std::this_thread::sleep_for(1000_ms);
    }

    folly::coro::blockingWait(client->heartbeat());
    // Evaluate the fresh routing info on a scratch clone of the map.
    auto copy = currentMap->clone();
    auto refreshResult = folly::coro::blockingWait(client->refreshRoutingInfo(false));
    if (UNLIKELY(!refreshResult)) {
      XLOGF(ERR, "refresh routing info error {}", refreshResult.error());
      continue;
    }
    auto result = copy->updateRouting(client->getRoutingInfo(), false);
    if (UNLIKELY(!result)) {
      XLOGF(ERR, "get and parse routing info error {}", result.error());
      continue;
    }

    // Only the first target that is still active is logged.
    bool needWaiting = false;
    for (auto &[targetId, target] : copy->getTargets()) {
      const bool stillActive = target.publicState == flat::PublicTargetState::SERVING ||
                               target.publicState == flat::PublicTargetState::SYNCING ||
                               target.publicState == flat::PublicTargetState::WAITING;
      if (!stillActive) {
        continue;
      }
      XLOGF(WARNING, "waiting for chain {} target {}", targetId, serde::toJsonString(target));
      needWaiting = true;
      break;
    }
    if (!needWaiting) {
      break;
    }
  }

  // 3. set listener.
  targetMap.setUpdateCallback([this](const TargetMap &map) { updateHeartbeatPayload(map); });
  RETURN_AND_LOG_ON_ERROR(refreshRoutingInfo());
  folly::coro::blockingWait(client->heartbeat());
  XLOGF(INFO, "Initial target map: {}", serde::toJsonString(targetMap.snapshot()->getTargets()));

  // The listener discards the result of refreshRoutingInfo().
  const bool succ = client->addRoutingInfoListener(std::string{kRoutingInfoListenerName},
                                                   [this](auto) { refreshRoutingInfo(); });
  if (UNLIKELY(!succ)) {
    auto msg = fmt::format("node {} addRoutingInfoListener failed!", appInfo.nodeId);
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }
  return Void{};
}
|
||||
|
||||
// Applies the routing info currently held by the mgmtd client to the target
// map and returns the result of that update.
// Precondition: mgmtdClient must be set; the loaded pointer is dereferenced
// without a null check.
Result<Void> Components::refreshRoutingInfo() { return targetMap.updateRouting(mgmtdClient.load()->getRoutingInfo()); }
|
||||
|
||||
// Shuts the node down: workers, forwarding and messenger first; then an
// "offline" heartbeat and the mgmtd client; then the coroutine pools; finally
// the storage targets are released on `executor` and the remaining resources
// are cleared. Always returns Void{}; individual failures are only logged.
Result<Void> Components::stopAndJoin(CPUExecutorGroup &executor) {
  LOG_COMMAND(INFO, "Stop aioReadWorker", aioReadWorker.stopAndJoin());
  LOG_COMMAND(INFO, "Stop syncMetaKvWorker", syncMetaKvWorker.stopAndJoin());
  LOG_COMMAND(INFO, "Stop punchHoleWorker", punchHoleWorker.stopAndJoin());
  LOG_COMMAND(INFO, "Stop allocateWorker", allocateWorker.stopAndJoin());
  LOG_COMMAND(INFO, "Stop dumpWorker", dumpWorker.stopAndJoin());
  LOG_COMMAND(INFO, "Stop checkWorker", checkWorker.stopAndJoin());
  LOG_COMMAND(INFO, "Stop resyncWorker", resyncWorker.stopAndJoin());
  LOG_COMMAND(INFO, "Stop storageOperator", storageOperator.stopAndJoin());
  LOG_COMMAND(INFO, "Stop reliableForwarding", reliableForwarding.stopAndJoin());
  LOG_COMMAND(INFO, "Stop messenger", messenger.stopAndJoin());

  // Replace the update callback with a no-op so that later map changes no
  // longer touch the heartbeat payload.
  targetMap.setUpdateCallback([](auto) {});
  XLOGF(INFO, "Send offline state");
  if (auto mgmtd = mgmtdClient.load()) {
    mgmtd->removeRoutingInfoListener(kRoutingInfoListenerName);
    updateHeartbeatPayload(*targetMap.snapshot(), true);
    folly::coro::blockingWait(mgmtd->heartbeat());
  }
  LOG_COMMAND(INFO, "Stop routingStore", stopMgmtdClient());

  LOG_COMMAND(INFO, "Stop readPool", readPool.stopAndJoin());
  LOG_COMMAND(INFO, "Stop updatePool", updatePool.stopAndJoin());
  LOG_COMMAND(INFO, "Stop syncPool", syncPool.stopAndJoin());
  LOG_COMMAND(INFO, "Stop defaultPool", defaultPool.stopAndJoin());

  // Collect the storage targets, then drop the map snapshot that referenced them.
  auto snapshot = targetMap.release();
  std::vector<std::shared_ptr<StorageTarget>> targets;
  for (auto &[targetId, target] : snapshot->getTargets()) {
    if (target.storageTarget == nullptr) {
      continue;
    }
    targets.push_back(target.storageTarget);
  }
  LOG_COMMAND(INFO, "Reset target map", snapshot.reset());

  const auto total = targets.size();
  XLOGF(WARNING, "start to release {} targets", total);
  std::atomic<uint32_t> released{};
  std::atomic<uint32_t> synced{};
  for (auto &target : targets) {
    // The counters are captured by reference; this is safe because the loop
    // below does not return before every task has bumped `released`.
    executor.randomPick().add([&, owned = std::move(target)]() mutable {
      auto result = owned->release();
      if (LIKELY(bool(result))) {
        ++synced;
      } else {
        XLOGF(CRITICAL, "storage target sync meta failed {}, error: {}", owned->path(), result.error());
      }
      owned = nullptr;
      ++released;
    });
  }

  // Poll every 100ms; log on every fifth poll.
  for (int i = 0; released.load() != total; ++i) {
    XLOGF_IF(INFO, i % 5 == 0, "Waiting for release targets finished...");
    std::this_thread::sleep_for(100_ms);
  }

  XLOGF(WARNING, "released {} targets, synced {} targets", released.load(), synced.load());

  LOG_COMMAND(INFO, "Clear storageTargets", storageTargets.globalFileStore().clear(executor));
  LOG_COMMAND(INFO, "Clear rdmabufPool", rdmabufPool.clear(executor));
  if (config.speed_up_quit()) {
    for (auto &engine : storageTargets.engines()) {
      engine->speed_up_quit();
    }
  }
  return Void{};
}
|
||||
|
||||
// Stops and drops the mgmtd client, then stops and drops the net client.
// Either may be absent. Always returns Void{}.
Result<Void> Components::stopMgmtdClient() {
  if (auto client = mgmtdClient.load(); client != nullptr) {
    folly::coro::blockingWait(client->stop());
  }
  mgmtdClient.store(nullptr);

  if (netClient != nullptr) {
    netClient->stopAndJoin();
  }
  netClient.reset();
  return Void{};
}
|
||||
|
||||
// Asks mgmtd for the client sessions and returns the set of their client ids.
// Fails with the mgmtd error if the request fails, and with kRoutingError while
// mgmtd reports that it is bootstrapping.
// Precondition: mgmtdClient must be set; it is dereferenced without a null check.
Result<robin_hood::unordered_set<std::string>> Components::getActiveClientsList() {
  auto result = folly::coro::blockingWait(mgmtdClient.load()->listClientSessions());
  RETURN_AND_LOG_ON_ERROR(result);

  if (result->bootstrapping) {
    std::string msg{"mgmtd is bootstrapping, skip"};
    XLOG(WARNING, msg);
    return makeError(StorageClientCode::kRoutingError, std::move(msg));
  }

  // The ids are moved out of the response, which is not used afterwards.
  robin_hood::unordered_set<std::string> activeClients;
  for (auto &session : result->sessions) {
    activeClients.emplace(std::move(session.clientId));
  }
  return Result<robin_hood::unordered_set<std::string>>(std::move(activeClients));
}
|
||||
|
||||
void Components::triggerHeartbeatIfNeed() {
|
||||
if (triggerHeartbeatFlag.exchange(0)) {
|
||||
mgmtdClient.load()->triggerHeartbeat();
|
||||
}
|
||||
}
|
||||
|
||||
void Components::updateHeartbeatPayload(const TargetMap &targetMap, bool offline /* = false */) {
|
||||
flat::StorageHeartbeatInfo heartbeat;
|
||||
for (auto &[targetId, target] : targetMap.getTargets()) {
|
||||
flat::LocalTargetInfo targetInfo;
|
||||
targetInfo.targetId = targetId;
|
||||
targetInfo.localState = offline ? flat::LocalTargetState::OFFLINE : target.localState;
|
||||
targetInfo.diskIndex = target.diskIndex;
|
||||
targetInfo.lowSpace = target.lowSpace;
|
||||
monitor::TagSet tag;
|
||||
tag.addTag("instance", fmt::format("{}", targetId));
|
||||
targetStateRecorder.set(uint32_t(target.localState), tag);
|
||||
if (targetInfo.localState != flat::LocalTargetState::OFFLINE) {
|
||||
targetInfo.usedSize = target.storageTarget->usedSize();
|
||||
targetInfo.chainVersion = target.vChainId.chainVer;
|
||||
}
|
||||
heartbeat.targets.push_back(targetInfo);
|
||||
}
|
||||
mgmtdClient.load()->updateHeartbeatPayload(heartbeat);
|
||||
++triggerHeartbeatFlag;
|
||||
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
124
src/storage/service/Components.h
Normal file
124
src/storage/service/Components.h
Normal file
@@ -0,0 +1,124 @@
|
||||
#pragma once
|
||||
|
||||
#include <folly/concurrency/AtomicSharedPtr.h>
|
||||
|
||||
#include "client/mgmtd/MgmtdClientForServer.h"
|
||||
#include "client/storage/StorageMessenger.h"
|
||||
#include "common/utils/ConfigBase.h"
|
||||
#include "common/utils/DynamicCoroutinesPool.h"
|
||||
#include "common/utils/LockManager.h"
|
||||
#include "common/utils/RobinHood.h"
|
||||
#include "fbs/storage/Service.h"
|
||||
#include "storage/aio/AioReadWorker.h"
|
||||
#include "storage/service/BufferPool.h"
|
||||
#include "storage/service/StorageOperator.h"
|
||||
#include "storage/service/TargetMap.h"
|
||||
#include "storage/store/StorageTargets.h"
|
||||
#include "storage/sync/ResyncWorker.h"
|
||||
#include "storage/worker/AllocateWorker.h"
|
||||
#include "storage/worker/CheckWorker.h"
|
||||
#include "storage/worker/DumpWorker.h"
|
||||
#include "storage/worker/PunchHoleWorker.h"
|
||||
#include "storage/worker/SyncMetaKvWorker.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
class ReliableForwarding;
|
||||
|
||||
struct Components {
|
||||
struct Config : public ConfigBase<Config> {
|
||||
CONFIG_OBJ(base, net::Server::Config, [](net::Server::Config &c) {
|
||||
c.set_groups_length(2);
|
||||
c.groups(0).listener().set_listen_port(8000);
|
||||
c.groups(0).set_network_type(net::Address::RDMA);
|
||||
c.groups(0).set_services({"StorageSerde"});
|
||||
|
||||
c.groups(1).set_network_type(net::Address::TCP);
|
||||
c.groups(1).listener().set_listen_port(9000);
|
||||
c.groups(1).set_use_independent_thread_pool(true);
|
||||
c.groups(1).set_services({"Core"});
|
||||
|
||||
c.thread_pool().set_num_io_threads(32);
|
||||
c.thread_pool().set_num_proc_threads(32);
|
||||
});
|
||||
|
||||
CONFIG_OBJ(client, net::Client::Config);
|
||||
CONFIG_OBJ(mgmtd, hf3fs::client::MgmtdClientForServer::Config);
|
||||
CONFIG_OBJ(targets, StorageTargets::Config);
|
||||
CONFIG_OBJ(storage, StorageOperator::Config);
|
||||
CONFIG_OBJ(reliable_forwarding, ReliableForwarding::Config);
|
||||
CONFIG_OBJ(reliable_update, ReliableUpdate::Config);
|
||||
CONFIG_OBJ(buffer_pool, BufferPool::Config);
|
||||
CONFIG_OBJ(aio_read_worker, AioReadWorker::Config);
|
||||
CONFIG_OBJ(sync_worker, ResyncWorker::Config);
|
||||
CONFIG_OBJ(check_worker, CheckWorker::Config);
|
||||
CONFIG_OBJ(dump_worker, DumpWorker::Config);
|
||||
CONFIG_OBJ(allocate_worker, AllocateWorker::Config);
|
||||
CONFIG_OBJ(sync_meta_kv_worker, SyncMetaKvWorker::Config);
|
||||
CONFIG_OBJ(forward_client, net::Client::Config);
|
||||
CONFIG_OBJ(coroutines_pool_read, DynamicCoroutinesPool::Config);
|
||||
CONFIG_OBJ(coroutines_pool_update, DynamicCoroutinesPool::Config);
|
||||
CONFIG_OBJ(coroutines_pool_sync, DynamicCoroutinesPool::Config);
|
||||
CONFIG_OBJ(coroutines_pool_default, DynamicCoroutinesPool::Config);
|
||||
CONFIG_HOT_UPDATED_ITEM(use_coroutines_pool_read, true);
|
||||
CONFIG_HOT_UPDATED_ITEM(use_coroutines_pool_update, true);
|
||||
CONFIG_HOT_UPDATED_ITEM(speed_up_quit, true);
|
||||
};
|
||||
|
||||
Components(const Config &config);
|
||||
|
||||
Result<Void> start(const flat::AppInfo &appInfo, net::ThreadPoolGroup &tpg);
|
||||
Result<Void> waitRoutingInfo(const flat::AppInfo &appInfo, folly::CPUThreadPoolExecutor &executor);
|
||||
Result<Void> refreshRoutingInfo();
|
||||
Result<Void> stopAndJoin(CPUExecutorGroup &executor);
|
||||
Result<Void> stopMgmtdClient();
|
||||
const flat::AppInfo &getAppInfo() const { return appInfo; }
|
||||
|
||||
Result<robin_hood::unordered_set<std::string>> getActiveClientsList();
|
||||
void triggerHeartbeatIfNeed();
|
||||
|
||||
inline DynamicCoroutinesPool &getCoroutinesPool(uint16_t methodId) {
|
||||
if (LIKELY(config.use_coroutines_pool_read()) && methodId == StorageSerde<>::batchReadMethodId) {
|
||||
return readPool;
|
||||
}
|
||||
if (LIKELY(config.use_coroutines_pool_update()) &&
|
||||
(methodId == StorageSerde<>::writeMethodId || methodId == StorageSerde<>::updateMethodId)) {
|
||||
return updatePool;
|
||||
}
|
||||
if (methodId == StorageSerde<>::syncStartMethodId || methodId == StorageSerde<>::getAllChunkMetadataMethodId) {
|
||||
return syncPool;
|
||||
}
|
||||
return defaultPool;
|
||||
}
|
||||
|
||||
protected:
|
||||
void updateHeartbeatPayload(const TargetMap &map, bool offline = false);
|
||||
|
||||
public:
|
||||
ConstructLog<"storage::Components"> constructLog_;
|
||||
const Config &config;
|
||||
flat::AppInfo appInfo;
|
||||
std::unique_ptr<net::Client> netClient;
|
||||
folly::atomic_shared_ptr<hf3fs::client::IMgmtdClientForServer> mgmtdClient;
|
||||
BufferPool rdmabufPool;
|
||||
AtomicallyTargetMap targetMap;
|
||||
StorageTargets storageTargets;
|
||||
AioReadWorker aioReadWorker;
|
||||
client::StorageMessenger messenger;
|
||||
ResyncWorker resyncWorker;
|
||||
CheckWorker checkWorker;
|
||||
DumpWorker dumpWorker;
|
||||
AllocateWorker allocateWorker;
|
||||
PunchHoleWorker punchHoleWorker;
|
||||
SyncMetaKvWorker syncMetaKvWorker;
|
||||
ReliableForwarding reliableForwarding;
|
||||
DynamicCoroutinesPool readPool;
|
||||
DynamicCoroutinesPool updatePool;
|
||||
DynamicCoroutinesPool syncPool;
|
||||
DynamicCoroutinesPool defaultPool;
|
||||
StorageOperator storageOperator;
|
||||
ReliableUpdate reliableUpdate;
|
||||
std::atomic<uint32_t> triggerHeartbeatFlag{};
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
281
src/storage/service/ReliableForwarding.cc
Normal file
281
src/storage/service/ReliableForwarding.cc
Normal file
@@ -0,0 +1,281 @@
|
||||
#include "storage/service/ReliableForwarding.h"
|
||||
|
||||
#include <folly/experimental/coro/Sleep.h>
|
||||
|
||||
#include "common/app/ApplicationBase.h"
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/utils/Duration.h"
|
||||
#include "common/utils/ExponentialBackoffRetry.h"
|
||||
#include "fbs/storage/Common.h"
|
||||
#include "storage/service/Components.h"
|
||||
#include "storage/service/TargetMap.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
namespace {
|
||||
|
||||
monitor::OperationRecorder reliableForwardRecorder("storage.reliable_forward");
|
||||
monitor::OperationRecorder syncingReadRecorder("storage.syncing_read");
|
||||
monitor::OperationRecorder updateRemoteRecorder("storage.update_remote");
|
||||
|
||||
monitor::CountRecorder forwardWriteBytes("storage.forward.write_bytes");
|
||||
monitor::DistributionRecorder forwardWriteDist("storage.forward.write_dist");
|
||||
monitor::CountRecorder forwardSyncingBytes("storage.forward.syncing_bytes");
|
||||
monitor::DistributionRecorder forwardSyncingDist("storage.forward.syncing_dist");
|
||||
|
||||
} // namespace
|
||||
|
||||
using namespace std::chrono_literals;
|
||||
|
||||
// No initialization work is performed; always succeeds.
Result<Void> ReliableForwarding::init() { return Void{}; }
|
||||
|
||||
// Nothing to stop or join here; always succeeds. The `stopped_` flag that ends
// the retry loop is set by beforeStop(), not by this function.
Result<Void> ReliableForwarding::stopAndJoin() { return Void{}; }
|
||||
|
||||
// Forwards `req` to the successor of the local target, retrying with
// exponential backoff until one of the following happens:
//   - the forward succeeds, or there is no successor (both count as success
//     for the operation recorder) -> the result is returned;
//   - `allowOutdatedChainVer` is false and the error is kRoutingVersionMismatch
//     -> the error result is returned immediately;
//   - the retry budget is exhausted (wait time of zero) -> the last error
//     result is returned;
//   - the target lookup fails -> its error is returned;
//   - beforeStop() was called -> kRequestRefused.
//
// `target` is refreshed from the target map before every attempt (and while
// waiting), so on return it holds the most recently looked-up target.
// `commitIO` is updated by forward(). `requestCtx` is only consulted to
// decide whether a timeout is an expected fault-injection outcome.
CoTask<IOResult> ReliableForwarding::forwardWithRetry(ServiceRequestContext &requestCtx,
                                                      const UpdateReq &req,
                                                      const net::RDMARemoteBuf &rdmabuf,
                                                      const ChunkEngineUpdateJob &chunkEngineJob,
                                                      TargetPtr &target,
                                                      CommitIO &commitIO,
                                                      bool allowOutdatedChainVer /* = true */) {
  auto startTime = RelativeTime::now();

  auto recordGuard = reliableForwardRecorder.record();

  ExponentialBackoffRetry retry(config_.retry_first_wait().asMs(),
                                config_.retry_max_wait().asMs(),
                                config_.retry_total_time().asMs());
  for (uint32_t retryCount = 0; !stopped_; ++retryCount) {
    // Doubles as the RPC timeout of this attempt and as the pause before the next one.
    auto waitTime = retry.getWaitTime();

    auto targetResult = components_.targetMap.getByChainId(req.payload.key.vChainId, allowOutdatedChainVer);
    CO_RETURN_ON_ERROR(targetResult);
    target = std::move(*targetResult);

    auto ioResult = co_await forward(req, retryCount, rdmabuf, chunkEngineJob, target, commitIO, waitTime);
    if (LIKELY(bool(ioResult.lengthInfo))) {
      recordGuard.succ();
      co_return ioResult;
    } else if (ioResult.lengthInfo.error().code() == StorageCode::kNoSuccessorTarget) {
      // Nothing to forward to: the local target is the tail of the chain.
      recordGuard.succ();
      co_return ioResult;
    }

    // TODO(SF): fine-grained error handling.
    auto code = ioResult.lengthInfo.error().code();
    if (!allowOutdatedChainVer && code == StorageClientCode::kRoutingVersionMismatch) {
      XLOGF(ERR,
            "forwarding routing version mismatch, req {}, result {}, elapsed {}",
            req,
            ioResult,
            (RelativeTime::now() - startTime).asMs());
      co_return ioResult;
    }

    if (waitTime.count() == 0) {
      // Retry budget exhausted.
      XLOGF_IF(DFATAL,
               !requestCtx.debugFlags.faultInjectionEnabled(),
               "forwarding timeout with error, req {}, result {}",
               req,
               ioResult);
      co_return ioResult;
    } else if (code != RPCCode::kTimeout) {
      // An RPC timeout retries immediately; any other error waits up to
      // `waitTime` first.
      XLOGF(WARNING,
            "forwarding wait and retry, req {}, error {}, elapsed {}",
            req,
            ioResult,
            (RelativeTime::now() - startTime).asMs());
      // Sleep in short slices so that a stop request or the loss of the
      // successor ends the wait early.
      constexpr auto checkInterval = 100ms;
      for (auto elapsed = 0ms; elapsed < waitTime && !stopped_; elapsed += checkInterval) {
        auto targetResult = components_.targetMap.getByChainId(req.payload.key.vChainId, allowOutdatedChainVer);
        CO_RETURN_ON_ERROR(targetResult);
        target = std::move(*targetResult);
        if (!target->successor.has_value()) {
          break;
        }
        co_await folly::coro::sleep(std::min(checkInterval, waitTime - elapsed));
      }
    }
  }

  auto msg = fmt::format("req is refused because of stopping, req {}", req);
  XLOG(ERR, msg);
  co_return makeError(RPCCode::kRequestRefused, std::move(msg));
}
|
||||
|
||||
// Performs a single forwarding attempt and records the outcome in `commitIO`.
//   - No successor: commitIO takes the local chain version and
//     kNoSuccessorTarget is returned.
//   - Successful forward: commitIO takes the successor's commit version and
//     chain version. If that chain version is newer than the local one, the
//     call still returns kChainVersionMismatch.
//   - Failed forward: the error result is returned unchanged.
CoTask<IOResult> ReliableForwarding::forward(const UpdateReq &req,
                                             uint32_t retryCount,
                                             const net::RDMARemoteBuf &rdmabuf,
                                             const ChunkEngineUpdateJob &chunkEngineJob,
                                             TargetPtr &target,
                                             CommitIO &commitIO,
                                             std::chrono::milliseconds timeout) {
  const bool hasSuccessor = target->successor.has_value();
  if (!hasSuccessor) {
    // use the latest chain version.
    commitIO.commitChainVer = target->vChainId.chainVer;
    co_return makeError(StorageCode::kNoSuccessorTarget);
  }

  auto ioResult = co_await doForward(req, rdmabuf, chunkEngineJob, retryCount, *target, commitIO.isSyncing, timeout);
  if (!ioResult.lengthInfo) {
    co_return ioResult;
  }

  commitIO.commitVer = ioResult.commitVer;
  // use successor's chain version.
  commitIO.commitChainVer = ioResult.commitChainVer;

  if (!(ioResult.commitChainVer > target->vChainId.chainVer)) {
    co_return ioResult;
  }

  // the remote obtains a higher chain version, and the local need to obtain the latest version by retry.
  auto msg = fmt::format("the remote obtains a higher chain version {} > current {}, req {}",
                         ioResult.commitChainVer,
                         target->vChainId.chainVer,
                         req);
  XLOGF(WARNING, "{}", msg);
  co_return makeError(StorageCode::kChainVersionMismatch, std::move(msg));
}
|
||||
|
||||
// Builds the request for the successor and sends it.
//
// The outgoing request is a copy of `req` marked as not coming from a client,
// carrying `retryCount`, `rdmabuf` and the local chain version. `isSyncing` is
// an out-parameter set from the successor's public state.
//
// When the successor is syncing and the request is a write/truncate/extend
// that is itself a syncing request or does not cover the whole chunk, the
// entire chunk is first read locally (including uncommitted data) and
// forwarded as one full WRITE.
//
// Precondition: `target.successor` holds a value; it is dereferenced unchecked.
CoTask<IOResult> ReliableForwarding::doForward(const UpdateReq &req,
                                               const net::RDMARemoteBuf &rdmabuf,
                                               const ChunkEngineUpdateJob &chunkEngineJob,
                                               uint32_t retryCount,
                                               const Target &target,
                                               bool &isSyncing,
                                               std::chrono::milliseconds timeout) {
  UpdateReq updateReq = req;
  updateReq.options.fromClient = false;
  updateReq.retryCount = retryCount;
  updateReq.payload.rdmabuf = rdmabuf;
  updateReq.payload.key.vChainId.chainVer = target.vChainId.chainVer;

  // Declared at function scope so that memory allocated from it for the
  // full-chunk read stays alive until the RPC below has finished.
  auto buffer = components_.rdmabufPool.get();
  isSyncing = target.successor->targetInfo.publicState == hf3fs::flat::PublicTargetState::SYNCING;
  if (isSyncing) {
    updateReq.options.isSyncing = true;
    updateReq.options.commitChainVer = target.vChainId.chainVer;
  }

  const bool partialOrSyncingReq = req.options.isSyncing || req.payload.length != req.payload.chunkSize;
  const bool readForSyncing = req.payload.isWriteTruncateExtend() && isSyncing && partialOrSyncingReq;
  if (readForSyncing) {
    auto readGuard = syncingReadRecorder.record();

    // read the entire chunk.
    IOResult readResult;
    auto allocateResult = buffer.tryAllocate(req.payload.chunkSize);
    if (UNLIKELY(!allocateResult)) {
      // Fall back to the coroutine allocation path.
      allocateResult = co_await buffer.allocate(req.payload.chunkSize);
    }
    if (UNLIKELY(!allocateResult)) {
      readResult.lengthInfo = makeError(std::move(allocateResult.error()));
      co_return readResult;
    }
    auto &readBuf = *allocateResult;

    ReadIO readIO;
    readIO.key = updateReq.payload.key;
    readIO.offset = 0;
    readIO.length = req.payload.chunkSize;
    BatchReadJob readJob(readIO, target.storageTarget.get(), readResult, req.payload.checksum.type);
    readJob.setRecalculateChecksum();
    readJob.front().state().localbuf = readBuf;
    readJob.front().state().bufferIndex = buffer.index();
    readJob.front().state().readUncommitted = true;
    if (chunkEngineJob.chunk()) {
      readJob.front().state().chunkEngineJob.set(nullptr, chunkEngineJob.chunk()->raw_chunk());
    }

    co_await components_.aioReadWorker.enqueue(&readJob);
    co_await readJob.complete();
    CO_RETURN_ON_ERROR(readResult.lengthInfo);  // OK.

    // clear the inline data if the update is built from full chunk read
    if (BITFLAGS_CONTAIN(updateReq.featureFlags, FeatureFlags::SEND_DATA_INLINE)) {
      BITFLAGS_CLEAR(updateReq.featureFlags, FeatureFlags::SEND_DATA_INLINE);
      updateReq.payload.inlinebuf.data.clear();
    }

    // Turn the outgoing request into a WRITE of what was just read.
    auto length = *readResult.lengthInfo;
    updateReq.payload.updateVer = readResult.updateVer;
    if (req.options.isSyncing) {
      updateReq.options.commitChainVer = readJob.front().result().commitChainVer;
    }
    updateReq.payload.offset = 0;
    updateReq.payload.length = length;
    updateReq.payload.rdmabuf = readBuf.first(length).toRemoteBuf();
    updateReq.payload.checksum = readJob.front().state().chunkChecksum;
    updateReq.payload.updateType = UpdateType::WRITE;

    // Small payloads are additionally attached inline.
    if (length <= config_.max_inline_forward_bytes()) {
      updateReq.payload.inlinebuf.data.assign(readBuf.ptr(), readBuf.ptr() + length);
      BITFLAGS_SET(updateReq.featureFlags, hf3fs::storage::FeatureFlags::SEND_DATA_INLINE);
    }

    readGuard.succ();
  } else if (isSyncing && !req.payload.isRemove() && chunkEngineJob.chunk() == nullptr) {
    // No full-chunk read: take the update version from the chunk's metadata.
    auto chunkResult = target.storageTarget->queryChunk(req.payload.key.chunkId);
    if (UNLIKELY(!chunkResult)) {
      XLOGF(ERR, "forward query chunk failed, req {}, error {}", updateReq, chunkResult.error());
      co_return makeError(std::move(chunkResult.error()));
    }
    updateReq.payload.updateVer = chunkResult->updateVer;
  }

  auto forwardGuard = updateRemoteRecorder.record();
  auto addrResult = target.getSuccessorAddr();
  if (UNLIKELY(!addrResult)) {
    XLOGF(ERR, "target forward addr invalid, target {}", target);
    co_return makeError(std::move(addrResult.error()));
  }
  net::UserRequestOptions reqOptions;
  reqOptions.timeout = Duration{timeout};
  auto updateResult = co_await components_.messenger.update(*addrResult, updateReq, &reqOptions);
  if (UNLIKELY(!updateResult)) {
    // The RPC itself failed (the message says "timeout" for any such failure).
    XLOGF(ERR, "forward timeout, req {}, result {}", updateReq, updateResult);
    co_return makeError(std::move(updateResult.error()));
  }

  auto &remote = updateResult->result;
  if (LIKELY(bool(remote.lengthInfo))) {
    if (target.vChainId.chainVer < remote.commitChainVer) {
      auto msg = fmt::format("chain version local < remote, req {} local {} remote {}",
                             updateReq,
                             target,
                             updateResult->result);
      XLOG(ERR, msg);
      co_return makeError(StorageCode::kChainVersionMismatch, std::move(msg));
    }

    auto length = *remote.lengthInfo;
    monitor::TagSet tag;
    tag.addTag("instance", fmt::format("{}", target.targetId));
    if (isSyncing) {
      // Report the original request's update version back to the caller.
      remote.updateVer = req.payload.updateVer;
      forwardSyncingBytes.addSample(length, tag);
      forwardSyncingDist.addSample(length, tag);
    } else {
      forwardWriteBytes.addSample(length, tag);
      forwardWriteDist.addSample(length, tag);
    }

    forwardGuard.succ();
  } else {
    XLOGF(ERR, "forward failed, req {}, result {}", updateReq, updateResult->result);
    auto errorCode = remote.lengthInfo.error().code();
    if (errorCode == StorageCode::kChecksumMismatch) {
      // Recompute the checksum over the local buffer. If it does not match the
      // one that was sent, the local memory is considered corrupted and the
      // process signals itself.
      auto reqChecksum = updateReq.payload.checksum;
      auto realChecksum = ChecksumInfo::create(reqChecksum.type,
                                               (const uint8_t *)updateReq.payload.rdmabuf.addr(),
                                               updateReq.payload.length);
      if (reqChecksum != realChecksum) {
        XLOGF(DFATAL,
              "local rdma buffer is corrupted local {} != client {}, req: {}, kill self...",
              realChecksum,
              reqChecksum,
              req);
        ApplicationBase::handleSignal(SIGUSR2);
      }
    }
  }
  co_return updateResult->result;
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
65
src/storage/service/ReliableForwarding.h
Normal file
65
src/storage/service/ReliableForwarding.h
Normal file
@@ -0,0 +1,65 @@
|
||||
#pragma once
|
||||
|
||||
#include "client/storage/StorageMessenger.h"
|
||||
#include "common/net/Client.h"
|
||||
#include "common/net/ib/RDMABuf.h"
|
||||
#include "common/utils/ConfigBase.h"
|
||||
#include "fbs/storage/Common.h"
|
||||
#include "storage/update/UpdateJob.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
struct Components;
|
||||
struct Target;
|
||||
|
||||
class ReliableForwarding {
|
||||
public:
|
||||
struct Config : ConfigBase<Config> {
|
||||
CONFIG_HOT_UPDATED_ITEM(retry_first_wait, 100_ms);
|
||||
CONFIG_HOT_UPDATED_ITEM(retry_max_wait, 1000_ms);
|
||||
CONFIG_HOT_UPDATED_ITEM(retry_total_time, 60_s);
|
||||
CONFIG_HOT_UPDATED_ITEM(max_inline_forward_bytes, Size{});
|
||||
};
|
||||
|
||||
ReliableForwarding(const Config &config, Components &components)
|
||||
: config_(config),
|
||||
components_(components) {}
|
||||
|
||||
Result<Void> init();
|
||||
|
||||
void beforeStop() { stopped_ = true; }
|
||||
|
||||
Result<Void> stopAndJoin();
|
||||
|
||||
CoTask<IOResult> forwardWithRetry(ServiceRequestContext &requestCtx,
|
||||
const UpdateReq &req,
|
||||
const net::RDMARemoteBuf &rdmabuf,
|
||||
const ChunkEngineUpdateJob &chunkEngineJob,
|
||||
TargetPtr &target,
|
||||
CommitIO &commitIO,
|
||||
bool allowOutdatedChainVer = true);
|
||||
|
||||
CoTask<IOResult> forward(const UpdateReq &req,
|
||||
uint32_t retryCount,
|
||||
const net::RDMARemoteBuf &rdmabuf,
|
||||
const ChunkEngineUpdateJob &chunkEngineJob,
|
||||
TargetPtr &target,
|
||||
CommitIO &commitIO,
|
||||
std::chrono::milliseconds timeout);
|
||||
|
||||
CoTask<IOResult> doForward(const UpdateReq &req,
|
||||
const net::RDMARemoteBuf &rdmabuf,
|
||||
const ChunkEngineUpdateJob &chunkEngineJob,
|
||||
uint32_t retryCount,
|
||||
const Target &target,
|
||||
bool &isSyncing,
|
||||
std::chrono::milliseconds timeout);
|
||||
|
||||
private:
|
||||
ConstructLog<"storage::ReliableForwarding"> constructLog_;
|
||||
const Config &config_;
|
||||
Components &components_;
|
||||
std::atomic<bool> stopped_ = false;
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
158
src/storage/service/ReliableUpdate.cc
Normal file
158
src/storage/service/ReliableUpdate.cc
Normal file
@@ -0,0 +1,158 @@
|
||||
#include "storage/service/ReliableUpdate.h"
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/utils/Duration.h"
|
||||
#include "storage/service/Components.h"
|
||||
#include "storage/service/StorageOperator.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
// Metrics emitted by ReliableUpdate::update().
// Outcome of updates that actually reach StorageOperator::handleUpdate.
monitor::OperationRecorder reliableUpdateRecorder{"storage.reliable_update"};
// Bumped when the channel lock could not be taken and the request is bounced for retry.
monitor::CountRecorder reliableUpdateWaited{"storage.reliable_update.waited"};
// Bumped when a request arrives with a sequence number older than the cached one.
monitor::CountRecorder reliableUpdateDuplidate{"storage.reliable_update.duplicate"};
// Bumped when a cached result is returned instead of re-running the update.
monitor::CountRecorder reliableUpdateCached{"storage.reliable_update.cached"};
// Outcome of the channel lock acquisition.
monitor::OperationRecorder waitChannelLockRecorder{"storage.wait_channel_lock"};
|
||||
|
||||
// Runs an update request through the per-client, per-(chain, channel) result cache.
//
// Flow:
//   0. refuse the request once beforeStop() has been called;
//   1. reject tags whose channel id is 0;
//   2. find or create the cached slot for (client, chain, channel) under the shard lock;
//   3. try to take the channel lock on the target; if it is busy, fail with kChannelIsLocked;
//   4. reject older sequence numbers as duplicates; for the same sequence number and generation,
//      return the cached successful result when the update versions agree;
//   5. otherwise execute the update via StorageOperator::handleUpdate and cache the outcome.
//
// `req` is taken by mutable reference because step 4 may write a previously used update version
// into `req.payload.updateVer` before re-executing the update.
CoTask<IOResult> ReliableUpdate::update(ServiceRequestContext &requestCtx,
                                        UpdateReq &req,
                                        net::IBSocket *ibSocket,
                                        TargetPtr &target) {
  XLOGF(DBG1, "Start reliable update, tag: {}, req: {}", req.tag, req);

  if (UNLIKELY(stopped_)) {
    auto reason = fmt::format("req is refused because of stopping, req {}", req);
    XLOG(ERR, reason);
    co_return makeError(RPCCode::kRequestRefused, std::move(reason));
  }

  // 1. check if channel id is valid.
  const bool invalidChannel = (req.tag.channel.id == ChannelId{0});
  if (invalidChannel) {
    XLOGF(DFATAL,
          "{} request has invalid message tag {}: {}",
          magic_enum::enum_name(req.payload.updateType),
          req.tag,
          req);
    co_return makeError(StorageClientCode::kFoundBug);
  }

  // 2. get cached.
  auto clientId = req.tag.clientId;
  auto cached = shards_.withLock(
      [&](ClientMap &clients) {
        auto &status = clients[clientId];
        if (status == nullptr) {
          status = std::make_shared<ClientStatus>();
        }
        auto slotKey = std::pair<ChainId, ChannelId>(req.payload.key.vChainId.chainId, req.tag.channel.id);
        auto &slot = status->channelMap[slotKey];
        status->lastUsedTime = UtcClock::now();
        // Aliasing shared_ptr: points at the slot but shares ownership of the whole ClientStatus,
        // so the slot stays alive even if cleanUpExpiredClients() erases the client entry.
        return std::shared_ptr<ReqResult>(status, &slot);
      },
      clientId);

  // 3. lock channel.
  auto lockRecordGuard = waitChannelLockRecorder.record();
  folly::coro::Baton baton;
  auto channelLock = target->storageTarget->tryLockChannel(baton, fmt::format("{}:{}", clientId, req.tag.channel.id));
  if (!channelLock.locked()) {
    reliableUpdateWaited.addSample(1);
    XLOGF(ERR, "Channel is locked, need retry, tag: {}, req: {}", req.tag, req);
    co_return makeError(StorageCode::kChannelIsLocked);
  }
  lockRecordGuard.report(true);

  IOResult updateResult;
  if (req.tag.channel.seqnum < cached->channelSeqnum) {
    // The cache has already moved on to a newer sequence number on this channel.
    reliableUpdateDuplidate.addSample(1);
    XLOGF(WARN, "Find a duplicate update, tag: {}, cached result: {}, req: {}", req.tag, *cached, req);
    co_return makeError(StorageClientCode::kDuplicateUpdate);
  }

  // 4. return cached result.
  const bool sameRequestSlot = req.tag.channel.seqnum == cached->channelSeqnum &&
                               target->storageTarget->generationId() == cached->generationId;
  if (sameRequestSlot) {
    if (req.tag.requestId != cached->requestId) {
      XLOGF(DFATAL,
            "[BUG] Message tag {} is already assigned to another update, cached result: {}, req: {}",
            req.tag,
            *cached,
            req);
      co_return makeError(StorageClientCode::kFoundBug);
    }

    const bool cachedSucceeded = cached->updateResult.lengthInfo.hasValue();
    if (cachedSucceeded) {
      const bool versionAgrees =
          req.payload.updateVer == 0 || req.payload.updateVer == cached->updateResult.updateVer;
      if (!versionAgrees) {
        // Falls through to step 5 and re-executes the update.
        XLOGF(CRITICAL,
              "Cached update version not equal to request update version, req:{}, cached result: {}",
              req,
              *cached);
      } else {
        updateResult = cached->updateResult;

        // Only the copy returned to the caller is patched; the cached entry is left untouched.
        if (*updateResult.lengthInfo != req.payload.length && !req.payload.isExtend()) {
          updateResult.lengthInfo = req.payload.length;
          XLOGF(WARN,
                "Cached length info {} not equal to write size in request {}, fixed update result: {}",
                cached->updateResult.lengthInfo,
                req,
                updateResult);
        }

        reliableUpdateCached.addSample(1);
        XLOGF(DBG1, "Return cached update result, tag: {}, cached result: {}, req: {}", req.tag, *cached, req);
        co_return updateResult;
      }
    } else if (req.payload.updateVer == 0 && !target->storageTarget->useChunkEngine() &&
               cached->succUpdateVer != 0) {
      // The cached attempt failed: reuse the update version recorded for it.
      XLOGF(CRITICAL, "Pick up previous update version, tag: {}, cached result: {}, req: {}", req.tag, *cached, req);
      req.payload.updateVer = cached->succUpdateVer;
    }
  }

  // 5. start a new task.
  auto opGuard = reliableUpdateRecorder.record();
  updateResult = co_await components_.storageOperator.handleUpdate(requestCtx, req, ibSocket, target);
  if (LIKELY(bool(updateResult.lengthInfo))) {
    opGuard.succ();
  }

  // Cache the outcome (success or failure) together with the version and generation it was run with.
  *cached = {req.tag.channel.seqnum,
             req.tag.requestId,
             updateResult,
             req.payload.updateVer,
             target->storageTarget->generationId()};

  XLOGF(DBG1, "Completed reliable update, tag: {}, result: {}", req.tag, *cached);
  co_return updateResult;
}
|
||||
|
||||
// Drops cached per-client state for clients that are no longer active.
//
// A client entry is erased when all of the following hold:
//   - its id is not the all-zero client id,
//   - it has not been used for at least `expired_clients_timeout`,
//   - its uuid (hex string) is not contained in `activeClients`.
//
// Does nothing when `clean_up_expired_clients` is disabled, or when `activeClients` is empty
// (an empty set is logged as an error and no client is removed).
// Always returns Void{}.
Result<Void> ReliableUpdate::cleanUpExpiredClients(const robin_hood::unordered_set<std::string> &activeClients) {
  if (!config_.clean_up_expired_clients()) {
    return Void{};
  }
  if (activeClients.empty()) {
    XLOGF(ERR, "activeClients is empty!");
    return Void{};
  }

  const auto allZero = ClientId::zero();
  std::size_t cleanUpClientCount = 0;
  shards_.iterate([&](ClientMap &map) {
    const auto now = UtcClock::now();
    const auto expiredClientsTimeout = config_.expired_clients_timeout();
    for (auto it = map.begin(); it != map.end();) {
      const auto &[clientId, clientStatus] = *it;
      // Cheap checks first: the active-set lookup builds a hex string per client, so only pay for
      // it when the client is already known to be idle long enough. The result is the same as
      // evaluating the three conditions in any other order.
      const bool idleTooLong = clientId != allZero && now >= clientStatus->lastUsedTime + expiredClientsTimeout;
      if (idleTooLong && !activeClients.contains(clientId.uuid.toHexString())) {
        XLOGF(WARNING, "clean up expired client {}, last used time: {}", clientId, clientStatus->lastUsedTime);
        it = map.erase(it);
        ++cleanUpClientCount;
      } else {
        ++it;
      }
    }
  });
  XLOGF(WARNING, "clean up {} expired clients", cleanUpClientCount);
  return Void{};
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
61
src/storage/service/ReliableUpdate.h
Normal file
61
src/storage/service/ReliableUpdate.h
Normal file
@@ -0,0 +1,61 @@
|
||||
#pragma once
|
||||
|
||||
#include "common/net/Transport.h"
|
||||
#include "common/utils/ConfigBase.h"
|
||||
#include "common/utils/Coroutine.h"
|
||||
#include "common/utils/Duration.h"
|
||||
#include "common/utils/LockManager.h"
|
||||
#include "common/utils/RobinHood.h"
|
||||
#include "common/utils/Shards.h"
|
||||
#include "common/utils/Size.h"
|
||||
#include "fbs/storage/Common.h"
|
||||
#include "storage/service/TargetMap.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
struct Components;
|
||||
class StorageOperator;
|
||||
|
||||
class ReliableUpdate {
|
||||
public:
|
||||
struct Config : ConfigBase<Config> {
|
||||
CONFIG_HOT_UPDATED_ITEM(clean_up_expired_clients, false);
|
||||
CONFIG_HOT_UPDATED_ITEM(expired_clients_timeout, 1_h);
|
||||
};
|
||||
ReliableUpdate(const Config &config, Components &components)
|
||||
: config_(config),
|
||||
components_(components) {}
|
||||
|
||||
CoTask<IOResult> update(ServiceRequestContext &requestCtx,
|
||||
UpdateReq &req,
|
||||
net::IBSocket *ibSocket,
|
||||
TargetPtr &target);
|
||||
|
||||
Result<Void> cleanUpExpiredClients(const robin_hood::unordered_set<std::string> &activeClients);
|
||||
|
||||
void beforeStop() { stopped_ = true; }
|
||||
|
||||
private:
|
||||
ConstructLog<"storage::ReliableUpdate"> constructLog_;
|
||||
const Config &config_;
|
||||
Components &components_;
|
||||
std::atomic<bool> stopped_ = false;
|
||||
folly::coro::Mutex mutex_;
|
||||
|
||||
struct ReqResult {
|
||||
SERDE_STRUCT_FIELD(channelSeqnum, ChannelSeqNum{0});
|
||||
SERDE_STRUCT_FIELD(requestId, RequestId{0});
|
||||
SERDE_STRUCT_FIELD(updateResult, IOResult{});
|
||||
SERDE_STRUCT_FIELD(succUpdateVer, ChunkVer{});
|
||||
SERDE_STRUCT_FIELD(generationId, uint32_t{});
|
||||
};
|
||||
|
||||
struct ClientStatus {
|
||||
std::unordered_map<std::pair<ChainId, ChannelId>, ReqResult> channelMap;
|
||||
UtcTime lastUsedTime;
|
||||
};
|
||||
using ClientMap = std::unordered_map<ClientId, std::shared_ptr<ClientStatus>>;
|
||||
Shards<ClientMap, 1024> shards_;
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
1203
src/storage/service/StorageOperator.cc
Normal file
1203
src/storage/service/StorageOperator.cc
Normal file
File diff suppressed because it is too large
Load Diff
161
src/storage/service/StorageOperator.h
Normal file
161
src/storage/service/StorageOperator.h
Normal file
@@ -0,0 +1,161 @@
|
||||
#pragma once
|
||||
|
||||
#include <folly/concurrency/ConcurrentHashMap.h>
|
||||
#include <folly/executors/CPUThreadPoolExecutor.h>
|
||||
|
||||
#include "analytics/StructuredTraceLog.h"
|
||||
#include "client/mgmtd/IMgmtdClientForServer.h"
|
||||
#include "client/mgmtd/RoutingInfo.h"
|
||||
#include "client/storage/StorageMessenger.h"
|
||||
#include "common/net/Server.h"
|
||||
#include "common/net/Transport.h"
|
||||
#include "common/net/ib/IBSocket.h"
|
||||
#include "common/net/ib/RDMABuf.h"
|
||||
#include "common/utils/Address.h"
|
||||
#include "common/utils/ConfigBase.h"
|
||||
#include "common/utils/Coroutine.h"
|
||||
#include "common/utils/LockManager.h"
|
||||
#include "common/utils/Semaphore.h"
|
||||
#include "storage/aio/AioReadWorker.h"
|
||||
#include "storage/service/BufferPool.h"
|
||||
#include "storage/service/ReliableForwarding.h"
|
||||
#include "storage/service/ReliableUpdate.h"
|
||||
#include "storage/store/StorageTargets.h"
|
||||
#include "storage/update/UpdateWorker.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
struct Components;
|
||||
|
||||
class StorageOperator {
|
||||
public:
|
||||
class Config : public ConfigBase<Config> {
|
||||
CONFIG_OBJ(write_worker, UpdateWorker::Config);
|
||||
CONFIG_OBJ(event_trace_log, analytics::StructuredTraceLog<StorageEventTrace>::Config);
|
||||
CONFIG_HOT_UPDATED_ITEM(max_num_results_per_query, uint32_t{100});
|
||||
CONFIG_HOT_UPDATED_ITEM(batch_read_job_split_size, uint32_t{1024});
|
||||
CONFIG_HOT_UPDATED_ITEM(post_buffer_per_bytes, 64_KB);
|
||||
CONFIG_HOT_UPDATED_ITEM(batch_read_ignore_chain_version, false);
|
||||
CONFIG_HOT_UPDATED_ITEM(max_concurrent_rdma_writes, 256U);
|
||||
CONFIG_HOT_UPDATED_ITEM(max_concurrent_rdma_reads, 256U);
|
||||
CONFIG_HOT_UPDATED_ITEM(read_only, false);
|
||||
CONFIG_HOT_UPDATED_ITEM(rdma_transmission_req_timeout, 0_ms);
|
||||
CONFIG_HOT_UPDATED_ITEM(apply_transmission_before_getting_semaphore, true);
|
||||
};
|
||||
|
||||
StorageOperator(const Config &config, Components &components)
|
||||
: config_(config),
|
||||
components_(components),
|
||||
updateWorker_(config_.write_worker()),
|
||||
storageEventTrace_(config.event_trace_log()) {
|
||||
for (const auto &ibdev : net::IBDevice::all()) {
|
||||
concurrentRdmaWriteSemaphore_.emplace(ibdev->id(), config.max_concurrent_rdma_writes());
|
||||
concurrentRdmaReadSemaphore_.emplace(ibdev->id(), config.max_concurrent_rdma_reads());
|
||||
}
|
||||
|
||||
onConfigUpdated_ = config_.addCallbackGuard([this]() {
|
||||
for (auto &[_, semaphore] : concurrentRdmaWriteSemaphore_) {
|
||||
semaphore.changeUsableTokens(config_.max_concurrent_rdma_writes());
|
||||
}
|
||||
for (auto &[_, semaphore] : concurrentRdmaReadSemaphore_) {
|
||||
semaphore.changeUsableTokens(config_.max_concurrent_rdma_reads());
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
Result<Void> init(uint32_t numberOfDisks);
|
||||
|
||||
Result<Void> stopAndJoin();
|
||||
|
||||
CoTryTask<BatchReadRsp> batchRead(ServiceRequestContext &requestCtx,
|
||||
const BatchReadReq &req,
|
||||
serde::CallContext &ctx);
|
||||
|
||||
CoTryTask<WriteRsp> write(ServiceRequestContext &requestCtx, const WriteReq &req, net::IBSocket *ibSocket);
|
||||
|
||||
CoTryTask<UpdateRsp> update(ServiceRequestContext &requestCtx, const UpdateReq &req, net::IBSocket *ibSocket);
|
||||
|
||||
CoTryTask<QueryLastChunkRsp> queryLastChunk(ServiceRequestContext &requestCtx, const QueryLastChunkReq &req);
|
||||
|
||||
CoTryTask<TruncateChunksRsp> truncateChunks(ServiceRequestContext &requestCtx, const TruncateChunksReq &req);
|
||||
|
||||
CoTryTask<RemoveChunksRsp> removeChunks(ServiceRequestContext &requestCtx, const RemoveChunksReq &req);
|
||||
|
||||
CoTryTask<TargetSyncInfo> syncStart(const SyncStartReq &req);
|
||||
|
||||
CoTryTask<SyncDoneRsp> syncDone(const SyncDoneReq &req);
|
||||
|
||||
CoTryTask<SpaceInfoRsp> spaceInfo(const SpaceInfoReq &req);
|
||||
|
||||
CoTryTask<CreateTargetRsp> createTarget(const CreateTargetReq &req);
|
||||
|
||||
CoTryTask<OfflineTargetRsp> offlineTarget(const OfflineTargetReq &req);
|
||||
|
||||
CoTryTask<RemoveTargetRsp> removeTarget(const RemoveTargetReq &req);
|
||||
|
||||
CoTryTask<QueryChunkRsp> queryChunk(const QueryChunkReq &req);
|
||||
|
||||
CoTryTask<GetAllChunkMetadataRsp> getAllChunkMetadata(const GetAllChunkMetadataReq &req);
|
||||
|
||||
protected:
|
||||
using ChunkMetadataProcessor = std::function<CoTryTask<void>(const ChunkId &, const ChunkMetadata &)>;
|
||||
|
||||
CoTask<IOResult> handleUpdate(ServiceRequestContext &requestCtx,
|
||||
UpdateReq &req,
|
||||
net::IBSocket *ibSocket,
|
||||
TargetPtr &target);
|
||||
|
||||
CoTask<IOResult> doUpdate(ServiceRequestContext &requestCtx,
|
||||
const UpdateIO &updateIO,
|
||||
const UpdateOptions &updateOptions,
|
||||
uint32_t featureFlags,
|
||||
const std::shared_ptr<StorageTarget> &target,
|
||||
net::IBSocket *ibSocket,
|
||||
BufferPool::Buffer &buffer,
|
||||
net::RDMARemoteBuf &remoteBuf,
|
||||
ChunkEngineUpdateJob &chunkEngineJob,
|
||||
bool allowToAllocate);
|
||||
|
||||
CoTask<IOResult> doCommit(ServiceRequestContext &requestCtx,
|
||||
const CommitIO &commitIO,
|
||||
const UpdateOptions &updateOptions,
|
||||
ChunkEngineUpdateJob &chunkEngineJob,
|
||||
uint32_t featureFlags,
|
||||
const std::shared_ptr<StorageTarget> &target);
|
||||
|
||||
Result<std::vector<std::pair<ChunkId, ChunkMetadata>>> doQuery(ServiceRequestContext &requestCtx,
|
||||
const VersionedChainId &vChainId,
|
||||
const ChunkIdRange &chunkIdRange);
|
||||
|
||||
CoTryTask<uint32_t> processQueryResults(ServiceRequestContext &requestCtx,
|
||||
const VersionedChainId &vChainId,
|
||||
const ChunkIdRange &chunkIdRanges,
|
||||
ChunkMetadataProcessor processor,
|
||||
bool &moreChunksInRange);
|
||||
|
||||
CoTask<IOResult> doTruncate(ServiceRequestContext &requestCtx,
|
||||
const TruncateChunkOp &op,
|
||||
flat::UserInfo userInfo,
|
||||
uint32_t featureFlags);
|
||||
|
||||
CoTask<IOResult> doRemove(ServiceRequestContext &requestCtx,
|
||||
const RemoveChunksOp &op,
|
||||
flat::UserInfo userInfo,
|
||||
uint32_t featureFlags);
|
||||
|
||||
private:
|
||||
friend class ReliableUpdate;
|
||||
|
||||
ConstructLog<"storage::StorageOperator"> constructLog_;
|
||||
const Config &config_;
|
||||
Components &components_;
|
||||
UpdateWorker updateWorker_;
|
||||
analytics::StructuredTraceLog<StorageEventTrace> storageEventTrace_;
|
||||
std::unique_ptr<ConfigCallbackGuard> onConfigUpdated_;
|
||||
std::map<uint8_t, hf3fs::Semaphore> concurrentRdmaWriteSemaphore_;
|
||||
std::map<uint8_t, hf3fs::Semaphore> concurrentRdmaReadSemaphore_;
|
||||
std::atomic<uint64_t> totalReadBytes_{};
|
||||
std::atomic<uint64_t> totalReadIOs_{};
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
60
src/storage/service/StorageServer.cc
Normal file
60
src/storage/service/StorageServer.cc
Normal file
@@ -0,0 +1,60 @@
|
||||
#include "storage/service/StorageServer.h"
|
||||
|
||||
#include <folly/experimental/coro/BlockingWait.h>
|
||||
#include <folly/experimental/coro/WithCancellation.h>
|
||||
#include <folly/logging/xlog.h>
|
||||
|
||||
#include "common/kv/mem/MemKVEngine.h"
|
||||
#include "common/utils/Result.h"
|
||||
#include "core/service/CoreService.h"
|
||||
#include "storage/service/ReliableForwarding.h"
|
||||
#include "storage/service/StorageService.h"
|
||||
#include "stubs/common/RealStubFactory.h"
|
||||
#include "stubs/mgmtd/MgmtdServiceStub.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
// Builds the network server from the `base` section of the configuration and hands the complete
// configuration to the component container.
StorageServer::StorageServer(const Components::Config &config)
    : net::Server(config.base()),
      components_(config) {}
|
||||
|
||||
// Stops the server before members are torn down, then logs the destruction.
StorageServer::~StorageServer() {
  stopAndJoin();
  XLOGF(INFO, "Destructor StorageServer");
}
|
||||
|
||||
// Registers the storage and core services, installs the coroutine-pool selector on the first
// service group and starts the components. Returns the first error encountered, if any.
Result<Void> StorageServer::beforeStart() {
  RETURN_AND_LOG_ON_ERROR(addSerdeService(std::make_unique<StorageService>(components_.storageOperator), true));
  RETURN_AND_LOG_ON_ERROR(addSerdeService(std::make_unique<core::CoreService>()));

  // Storage RPCs pick a pool by method id; every other service shares the default pool.
  groups().front()->setCoroutinesPoolGetter([this](const serde::MessagePacket<> &packet) -> DynamicCoroutinesPool & {
    if (packet.serviceId == StorageSerde<>::kServiceID) {
      return components_.getCoroutinesPool(packet.methodId);
    }
    return components_.defaultPool;
  });

  RETURN_AND_LOG_ON_ERROR(components_.start(appInfo(), tpg()));
  return Void{};
}
|
||||
|
||||
// Raises the stop flags of the update and forwarding components. Always succeeds.
Result<Void> StorageServer::beforeStop() {
  components_.reliableUpdate.beforeStop();
  components_.reliableForwarding.beforeStop();
  return Void{};
}
|
||||
|
||||
// Stops and joins the components, passing them the processing thread pool.
// Propagates (and logs) the error if the components fail to stop.
Result<Void> StorageServer::afterStop() {
  RETURN_AND_LOG_ON_ERROR(components_.stopAndJoin(tpg().procThreadPool()));
  return Void{};
}
|
||||
|
||||
// Takes ownership of the network client, wraps the mgmtd client for server-side use, and then
// starts the underlying net::Server with `info`.
hf3fs::Result<Void> StorageServer::start(const flat::AppInfo &info,
                                         std::unique_ptr<::hf3fs::net::Client> client,
                                         std::shared_ptr<::hf3fs::client::MgmtdClient> mgmtdClient) {
  // Both clients are installed before the server starts.
  components_.netClient = std::move(client);
  components_.mgmtdClient = std::make_unique<hf3fs::client::MgmtdClientForServer>(std::move(mgmtdClient));
  return net::Server::start(info);
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
59
src/storage/service/StorageServer.h
Normal file
59
src/storage/service/StorageServer.h
Normal file
@@ -0,0 +1,59 @@
|
||||
#pragma once
|
||||
|
||||
#include <folly/CancellationToken.h>
|
||||
|
||||
#include "client/mgmtd/MgmtdClientForServer.h"
|
||||
#include "common/net/Server.h"
|
||||
#include "core/app/ServerAppConfig.h"
|
||||
#include "core/app/ServerLauncher.h"
|
||||
#include "core/app/ServerLauncherConfig.h"
|
||||
#include "core/app/ServerMgmtdClientFetcher.h"
|
||||
#include "storage/service/Components.h"
|
||||
#include "storage/service/ReliableForwarding.h"
|
||||
#include "storage/service/ReliableUpdate.h"
|
||||
#include "storage/service/StorageOperator.h"
|
||||
|
||||
namespace hf3fs::test {
|
||||
struct StorageServerHelper;
|
||||
}
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
class StorageServer : public net::Server {
|
||||
public:
|
||||
static constexpr auto kName = "Storage";
|
||||
static constexpr auto kNodeType = flat::NodeType::STORAGE;
|
||||
|
||||
using AppConfig = core::ServerAppConfig;
|
||||
struct LauncherConfig : public core::ServerLauncherConfig {
|
||||
LauncherConfig() { mgmtd_client() = hf3fs::client::MgmtdClientForServer::Config{}; }
|
||||
};
|
||||
using RemoteConfigFetcher = core::launcher::ServerMgmtdClientFetcher;
|
||||
using Launcher = core::ServerLauncher<StorageServer>;
|
||||
|
||||
using CommonConfig = ApplicationBase::Config;
|
||||
using Config = Components::Config;
|
||||
StorageServer(const Components::Config &config);
|
||||
~StorageServer() override;
|
||||
|
||||
// set up storage server.
|
||||
Result<Void> beforeStart() final;
|
||||
|
||||
// before server stop.
|
||||
Result<Void> beforeStop() final;
|
||||
|
||||
// tear down storage server.
|
||||
Result<Void> afterStop() final;
|
||||
|
||||
using net::Server::start;
|
||||
hf3fs::Result<Void> start(const flat::AppInfo &info,
|
||||
std::unique_ptr<::hf3fs::net::Client> client,
|
||||
std::shared_ptr<::hf3fs::client::MgmtdClient> mgmtdClient);
|
||||
|
||||
private:
|
||||
friend struct test::StorageServerHelper;
|
||||
ConstructLog<"storage::StorageServer"> constructLog_;
|
||||
Components components_;
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
32
src/storage/service/StorageService.cc
Normal file
32
src/storage/service/StorageService.cc
Normal file
@@ -0,0 +1,32 @@
|
||||
#include "storage/service/StorageService.h"
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
namespace {
|
||||
|
||||
monitor::LatencyRecorder readQueueLatency{"storage.read.queue_latency"};
|
||||
monitor::LatencyRecorder updateQueueLatency{"storage.update.queue_latency"};
|
||||
monitor::LatencyRecorder defaultQueueLatency{"storage.default.queue_latency"};
|
||||
|
||||
} // namespace
|
||||
|
||||
// Records the queue latency of a read request; does nothing when the packet has no timestamp.
void StorageService::reportReadQueueLatency(serde::CallContext &ctx) {
  auto &&timestamp = ctx.packet().timestamp;
  if (!timestamp) {
    return;
  }
  readQueueLatency.addSample(timestamp->queueLatency());
}
|
||||
|
||||
// Records the queue latency of a write/update request; does nothing when the packet has no timestamp.
void StorageService::reportUpdateQueueLatency(serde::CallContext &ctx) {
  auto &&timestamp = ctx.packet().timestamp;
  if (!timestamp) {
    return;
  }
  updateQueueLatency.addSample(timestamp->queueLatency());
}
|
||||
|
||||
// Records the queue latency of any other request; does nothing when the packet has no timestamp.
void StorageService::reportDefaultQueueLatency(serde::CallContext &ctx) {
  auto &&timestamp = ctx.packet().timestamp;
  if (!timestamp) {
    return;
  }
  defaultQueueLatency.addSample(timestamp->queueLatency());
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
111
src/storage/service/StorageService.h
Normal file
111
src/storage/service/StorageService.h
Normal file
@@ -0,0 +1,111 @@
|
||||
#pragma once
|
||||
|
||||
#include "common/serde/CallContext.h"
|
||||
#include "fbs/storage/Service.h"
|
||||
#include "storage/service/StorageOperator.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
// RPC facade of the storage node. Every handler reports the request's queue latency and then
// delegates to the StorageOperator; batch handlers answer empty batches directly.
class StorageService : public serde::ServiceWrapper<StorageService, storage::StorageSerde> {
 public:
  // Keeps a reference only: `storageOperator` must outlive the service.
  // `explicit` prevents an accidental implicit conversion from a StorageOperator.
  explicit StorageService(StorageOperator &storageOperator)
      : storageOperator_(storageOperator) {}

  // An empty batch is answered immediately with a response carrying only the request tag.
  CoTryTask<BatchReadRsp> batchRead(serde::CallContext &ctx, const BatchReadReq &req) {
    reportReadQueueLatency(ctx);
    if (UNLIKELY(req.payloads.empty())) co_return BatchReadRsp{.tag = req.tag};
    ServiceRequestContext requestCtx{"batchRead", req.tag, req.retryCount, req.userInfo, req.debugFlags};
    co_return co_await storageOperator_.batchRead(requestCtx, req, ctx);
  }

  CoTryTask<WriteRsp> write(serde::CallContext &ctx, const WriteReq &req) {
    reportUpdateQueueLatency(ctx);
    ServiceRequestContext requestCtx{"write", req.tag, req.retryCount, req.userInfo, req.debugFlags};
    co_return co_await storageOperator_.write(requestCtx, req, ctx.transport()->ibSocket());
  }

  CoTryTask<UpdateRsp> update(serde::CallContext &ctx, const UpdateReq &req) {
    reportUpdateQueueLatency(ctx);
    ServiceRequestContext requestCtx{"update", req.tag, req.retryCount, req.userInfo, req.debugFlags};
    co_return co_await storageOperator_.update(requestCtx, req, ctx.transport()->ibSocket());
  }

  // An empty request is answered with a default-constructed response.
  CoTryTask<QueryLastChunkRsp> queryLastChunk(serde::CallContext &ctx, const QueryLastChunkReq &req) {
    reportDefaultQueueLatency(ctx);
    if (UNLIKELY(req.payloads.empty())) co_return QueryLastChunkRsp{};
    ServiceRequestContext requestCtx{"queryLastChunk", req.tag, req.retryCount, req.userInfo, req.debugFlags};
    co_return co_await storageOperator_.queryLastChunk(requestCtx, req);
  }

  // The request context uses the tag and retry count of the first payload.
  CoTryTask<TruncateChunksRsp> truncateChunks(serde::CallContext &ctx, const TruncateChunksReq &req) {
    reportDefaultQueueLatency(ctx);
    if (UNLIKELY(req.payloads.empty())) co_return TruncateChunksRsp{};
    ServiceRequestContext requestCtx{"truncateChunks",
                                     req.payloads.front().tag,
                                     req.payloads.front().retryCount,
                                     req.userInfo,
                                     req.debugFlags};
    co_return co_await storageOperator_.truncateChunks(requestCtx, req);
  }

  // The request context uses the tag and retry count of the first payload.
  CoTryTask<RemoveChunksRsp> removeChunks(serde::CallContext &ctx, const RemoveChunksReq &req) {
    reportDefaultQueueLatency(ctx);
    if (UNLIKELY(req.payloads.empty())) co_return RemoveChunksRsp{};
    ServiceRequestContext requestCtx{"removeChunks",
                                     req.payloads.front().tag,
                                     req.payloads.front().retryCount,
                                     req.userInfo,
                                     req.debugFlags};
    co_return co_await storageOperator_.removeChunks(requestCtx, req);
  }

  // The handlers below are not coroutines themselves: they return the operator's task unchanged.
  CoTryTask<TargetSyncInfo> syncStart(serde::CallContext &ctx, const SyncStartReq &req) {
    reportDefaultQueueLatency(ctx);
    return storageOperator_.syncStart(req);
  }

  CoTryTask<SyncDoneRsp> syncDone(serde::CallContext &ctx, const SyncDoneReq &req) {
    reportDefaultQueueLatency(ctx);
    return storageOperator_.syncDone(req);
  }

  CoTryTask<SpaceInfoRsp> spaceInfo(serde::CallContext &ctx, const SpaceInfoReq &req) {
    reportDefaultQueueLatency(ctx);
    return storageOperator_.spaceInfo(req);
  }

  CoTryTask<CreateTargetRsp> createTarget(serde::CallContext &ctx, const CreateTargetReq &req) {
    reportDefaultQueueLatency(ctx);
    return storageOperator_.createTarget(req);
  }

  CoTryTask<OfflineTargetRsp> offlineTarget(serde::CallContext &ctx, const OfflineTargetReq &req) {
    reportDefaultQueueLatency(ctx);
    return storageOperator_.offlineTarget(req);
  }

  CoTryTask<RemoveTargetRsp> removeTarget(serde::CallContext &ctx, const RemoveTargetReq &req) {
    reportDefaultQueueLatency(ctx);
    return storageOperator_.removeTarget(req);
  }

  CoTryTask<QueryChunkRsp> queryChunk(serde::CallContext &ctx, const QueryChunkReq &req) {
    reportDefaultQueueLatency(ctx);
    return storageOperator_.queryChunk(req);
  }

  CoTryTask<GetAllChunkMetadataRsp> getAllChunkMetadata(serde::CallContext &ctx, const GetAllChunkMetadataReq &req) {
    reportDefaultQueueLatency(ctx);
    return storageOperator_.getAllChunkMetadata(req);
  }

 private:
  // Each helper feeds the packet's queue latency into the recorder of its request class.
  void reportReadQueueLatency(serde::CallContext &ctx);
  void reportUpdateQueueLatency(serde::CallContext &ctx);
  void reportDefaultQueueLatency(serde::CallContext &ctx);

 private:
  StorageOperator &storageOperator_;
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
418
src/storage/service/TargetMap.cc
Normal file
418
src/storage/service/TargetMap.cc
Normal file
@@ -0,0 +1,418 @@
|
||||
#include "storage/service/TargetMap.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/utils/RobinHood.h"
|
||||
#include "fbs/mgmtd/MgmtdTypes.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
namespace {
|
||||
|
||||
monitor::OperationRecorder updateRoutingRecorder{"storage.update_routing"};
|
||||
|
||||
} // namespace
|
||||
|
||||
// Returns the first endpoint of the successor's first service group.
// Errors: kNoSuccessorTarget when there is no successor; kNoSuccessorAddr when the successor
// advertises no service group or no endpoint.
Result<net::Address> Target::getSuccessorAddr() const {
  if (UNLIKELY(!successor.has_value())) {
    return makeError(StorageCode::kNoSuccessorTarget);
  }

  auto &groups = successor->nodeInfo.app.serviceGroups;
  if (UNLIKELY(groups.empty())) {
    auto reason = fmt::format("target {} successor service groups is empty", *this);
    XLOG(ERR, reason);
    return makeError(StorageCode::kNoSuccessorAddr, std::move(reason));
  }

  auto &addrs = groups.front().endpoints;
  if (UNLIKELY(addrs.empty())) {
    auto reason = fmt::format("target {} successor service endpoints is empty", *this);
    XLOG(ERR, reason);
    return makeError(StorageCode::kNoSuccessorAddr, std::move(reason));
  }

  return addrs.front();
}
|
||||
|
||||
// Maps a chain id to the id of the local target serving it.
// Fails with kRoutingError (and logs) when the chain is unknown.
Result<TargetId> TargetMap::getTargetId(ChainId chainId) const {
  if (auto found = chainToTarget_.find(chainId); UNLIKELY(found == chainToTarget_.end())) {
    auto reason = fmt::format("chain {} not found", chainId);
    XLOG(ERR, reason);
    return makeError(StorageClientCode::kRoutingError, std::move(reason));
  } else {
    return found->second;
  }
}
|
||||
|
||||
// Looks up a target by id and returns a read-only pointer into the map.
// Fails with kRoutingError (and logs) when the target is unknown.
Result<const Target *> TargetMap::getTarget(TargetId targetId) const {
  if (auto found = targets_.find(targetId); UNLIKELY(found == targets_.end())) {
    auto reason = fmt::format("target {} not found", targetId);
    XLOG(ERR, reason);
    return makeError(StorageClientCode::kRoutingError, std::move(reason));
  } else {
    return &found->second;
  }
}
|
||||
|
||||
// Resolves the local target serving `vChainId` and checks that it is usable.
//
// Chain version rule: an exact match is always accepted. A mismatch is accepted only when
// `allowOutdatedChainVer` is true and the requested version is not newer than the local one.
//
// Errors:
//   - kRoutingError            unknown chain or target (from getTargetId / getTarget);
//   - kRoutingVersionMismatch  chain version rejected by the rule above;
//   - kTargetOffline           the target is OFFLINE, or it has no storage target attached.
Result<const Target *> TargetMap::getByChainId(VersionedChainId vChainId, bool allowOutdatedChainVer) const {
  CHECK_RESULT(targetId, getTargetId(vChainId.chainId));
  CHECK_RESULT(target, getTarget(targetId));
  if (target->vChainId != vChainId && (!allowOutdatedChainVer || vChainId.chainVer > target->vChainId.chainVer)) {
    auto msg = fmt::format("chain {} version mismatch request {} != local {}",
                           vChainId.chainId,
                           vChainId.chainVer,
                           target->vChainId.chainVer);
    XLOG(ERR, msg);
    return makeError(StorageClientCode::kRoutingVersionMismatch, std::move(msg));
  }
  if (target->localState == flat::LocalTargetState::OFFLINE) {
    auto msg = fmt::format("chain {} target {} is offline", vChainId.chainId, target->targetId);
    XLOG(ERR, msg);
    return makeError(StorageCode::kTargetOffline, std::move(msg));
  }
  if (target->storageTarget == nullptr) {
    // Not OFFLINE yet no storage target: report it with its own message so it can be told apart
    // from the ordinary offline case above. The error code is unchanged.
    auto msg = fmt::format("chain {} target {} has no storage target, local state {}",
                           vChainId.chainId,
                           target->targetId,
                           magic_enum::enum_name(target->localState));
    XLOG(CRITICAL, msg);
    return makeError(StorageCode::kTargetOffline, std::move(msg));
  }
  return target;
}
|
||||
|
||||
// Registers `storageTarget` as an ONLINE target.
// If an entry with the same id already exists it is replaced only when that entry is OFFLINE;
// otherwise the call fails with kTargetStateInvalid.
Result<Void> TargetMap::addStorageTarget(const std::shared_ptr<StorageTarget> &storageTarget) {
  auto targetId = storageTarget->targetId();

  // Build the new entry from the storage target's own properties.
  Target fresh;
  fresh.storageTarget = storageTarget;
  fresh.targetId = targetId;
  fresh.chainId = storageTarget->chainId();
  fresh.path = storageTarget->path();
  fresh.localState = flat::LocalTargetState::ONLINE;
  fresh.diskIndex = storageTarget->diskIndex();
  fresh.useChunkEngine = storageTarget->useChunkEngine();

  auto [slot, inserted] = targets_.emplace(targetId, fresh);
  if (UNLIKELY(!inserted)) {
    if (slot->second.localState != flat::LocalTargetState::OFFLINE) {
      auto reason = fmt::format("target {} already exists", targetId);
      XLOG(ERR, reason);
      return makeError(StorageCode::kTargetStateInvalid, std::move(reason));
    }
    // An offline entry is overwritten wholesale by the new one.
    slot->second = std::move(fresh);
  }
  return Void{};
}
|
||||
|
||||
// Look up a target by id and return a mutable pointer to the stored entry.
// Returns kRoutingError when the id is not present in the map.
Result<Target *> TargetMap::getMutableTarget(TargetId targetId) {
  if (auto pos = targets_.find(targetId); UNLIKELY(pos == targets_.end())) {
    return makeError(StorageClientCode::kRoutingError, fmt::format("target {} not found", targetId));
  } else {
    return &pos->second;
  }
}
|
||||
|
||||
// Mark the local target of `chainId` as UPTODATE once sync receive has finished.
// The chain is resolved with an exact version match (outdated chain versions are not allowed).
Result<Void> TargetMap::syncReceiveDone(VersionedChainId chainId) {
  CHECK_RESULT(resolved, getByChainId(chainId, false));
  const auto localTargetId = resolved->targetId;
  CHECK_RESULT(entry, getMutableTarget(localTargetId));

  const auto previousStateName = magic_enum::enum_name(entry->localState);
  XLOGF(WARNING,
        "chain {} target {} sync receive done {} -> UPTODATE",
        chainId,
        localTargetId,
        previousStateName);
  entry->localState = flat::LocalTargetState::UPTODATE;
  return Void{};
}
|
||||
|
||||
// Rebuild the routing-derived state of every local target from routing info `r`.
//
// Steps:
//  1. reject null or older-than-current routing info, then reset all routing-derived fields
//     (head/tail flags, versioned chain id, public state, successor) while remembering which targets were
//     head / tail / LASTSRV before the reset;
//  2. for each chain, pick the first chain target that exists locally, refresh its local and public state,
//     record the chain -> target mapping and compute its successor;
//  3. for targets that were LASTSRV and are now SERVING / SYNCING / WAITING, reset uncommitted chunks.
//
// `log` only controls the "becomes head/tail" messages. Returns kRoutingError on inconsistent routing info.
Result<Void> TargetMap::updateRouting(std::shared_ptr<hf3fs::client::RoutingInfo> r, bool log /* = true */) {
  auto recordGuard = updateRoutingRecorder.record();

  if (UNLIKELY(r == nullptr)) {
    XLOGF(ERR, "routing info is empty");
    return makeError(StorageClientCode::kRoutingError, "routing info is empty");
  }
  auto &routingInfo = r->raw();
  if (routingInfoVersion_ > routingInfo->routingInfoVersion) {
    auto msg = fmt::format("routing info expired! {} > {}", routingInfoVersion_, routingInfo->routingInfoVersion);
    XLOG(ERR, msg);
    return makeError(StorageClientCode::kRoutingError, std::move(msg));
  }
  XLOGF(INFO, "routing info updated, {} -> {}", routingInfoVersion_, routingInfo->routingInfoVersion);

  // 1. reset current state.
  routingInfoVersion_ = routingInfo->routingInfoVersion;
  chainToTarget_.clear();
  syncingChains_.clear();
  robin_hood::unordered_set<TargetId> headTargets;
  robin_hood::unordered_set<TargetId> tailTargets;
  robin_hood::unordered_set<TargetId> lastSrvTargets;
  for (auto &[targetId, target] : targets_) {
    // Remember the previous roles so that role changes can be logged / acted upon below.
    if (target.isHead) {
      headTargets.insert(target.targetId);
    }
    if (target.isTail) {
      tailTargets.insert(target.targetId);
    }
    if (target.publicState == flat::PublicTargetState::LASTSRV) {
      lastSrvTargets.insert(target.targetId);
    }
    target.isHead = false;
    target.isTail = false;
    target.vChainId = VersionedChainId{};
    target.publicState = flat::PublicTargetState::INVALID;
    target.successor = std::nullopt;
  }
  bool invalidRoutingInfo = false;
  // Dump the whole routing info once on exit if any successor turned out to be invalid.
  auto invalidRoutingInfoLogGuard = folly::makeGuard([&] {
    if (invalidRoutingInfo) {
      XLOGF(CRITICAL, "invalid routing info: {}", *routingInfo);
    }
  });

  // 2. iterate routing info.
  for (auto &[id, chain] : routingInfo->chains) {
    // 3. find target in chain.
    // A plain membership test is enough here; going through getMutableTarget() would format an error
    // message for every chain target that is not hosted locally.
    auto it = std::find_if(chain.targets.begin(), chain.targets.end(), [this](const flat::ChainTargetInfo &info) {
      return targets_.contains(info.targetId);
    });
    if (it == chain.targets.end()) {
      continue;
    }

    // 4. find target info.
    auto targetId = it->targetId;
    auto targetInfo = routingInfo->getTarget(targetId);
    if (UNLIKELY(!targetInfo)) {
      auto msg = fmt::format("targetInfo id {} not found", targetId);
      XLOG(ERR, msg);
      return makeError(StorageClientCode::kRoutingError, std::move(msg));
    }

    // 5. update local target.
    CHECK_RESULT(target, getMutableTarget(targetId));
    bool targetIsServing = targetInfo->publicState == flat::PublicTargetState::SERVING ||
                           targetInfo->publicState == flat::PublicTargetState::SYNCING;
    auto previousLocalState = target->localState;
    target->isHead = (targetIsServing && it == chain.targets.begin());
    target->vChainId = VersionedChainId{chain.chainId, chain.chainVersion};
    if (target->storageTarget != nullptr) {
      // A storage target without a chain id adopts the chain from routing; otherwise both must agree.
      if (target->storageTarget->chainId() == ChainId{}) {
        RETURN_AND_LOG_ON_ERROR(target->storageTarget->setChainId(chain.chainId));
      }
      if (target->storageTarget->chainId() != chain.chainId) {
        auto msg = fmt::format("target.chain != routing.chain, target {}, chain {}", *target, chain);
        XLOG(ERR, msg);
        return makeError(StorageClientCode::kRoutingError, std::move(msg));
      }
    }
    target->localState = updateLocalState(targetId, previousLocalState, targetInfo->publicState);
    target->publicState = targetInfo->publicState;
    auto [chainToTargetIt, succ] = chainToTarget_.emplace(chain.chainId, targetId);
    if (!succ) {
      auto msg = fmt::format("chain {} map to 2 targets {}, {}", chain.chainId, chainToTargetIt->second, targetId);
      XLOG(ERR, msg);
      return makeError(StorageClientCode::kRoutingError, std::move(msg));
    }

    // On the transition into OFFLINE, drop the strong reference and keep only a weak one.
    if (previousLocalState != flat::LocalTargetState::OFFLINE &&
        target->localState == flat::LocalTargetState::OFFLINE) {
      target->weakStorageTarget = target->storageTarget->aliveWeakPtr();
      target->storageTarget = nullptr;
      continue;
    }

    // 6. update successor.
    // The loop body runs at most once (it always ends in `break`): only the next chain target is examined.
    while (targetIsServing && ++it != chain.targets.end()) {
      auto successorInfo = routingInfo->getTarget(it->targetId);
      if (UNLIKELY(!successorInfo)) {
        auto msg = fmt::format("successor {} not found", it->targetId);
        XLOG(ERR, msg);
        return makeError(StorageClientCode::kRoutingError, std::move(msg));
      }
      if (successorInfo->publicState == flat::PublicTargetState::SERVING) {
        target->successor = Successor{{}, *successorInfo};
      } else if (successorInfo->publicState == flat::PublicTargetState::SYNCING) {
        target->successor = Successor{{}, *successorInfo};
        syncingChains_.push_back(VersionedChainId{chain.chainId, chain.chainVersion});
      }

      if (target->successor) {
        if (!successorInfo->nodeId.has_value()) {
          XLOGF(WARNING, "target {} node id is nullopt", it->targetId);
          break;
        }
        auto node = routingInfo->getNode(*successorInfo->nodeId);
        if (!node) {
          XLOGF(WARNING, "node {} not found", successorInfo->nodeId);
          break;
        }
        target->successor->nodeInfo = *node;
        if (UNLIKELY(target->successor->nodeInfo.app.serviceGroups.empty())) {
          XLOGF(CRITICAL, "successor invalid! chain {}, successor {}, node {}", chain.chainId, *successorInfo, *node);
          invalidRoutingInfo = true;
        }
      }
      break;
    }
    target->isTail = (targetIsServing && !target->successor.has_value());

    // Log role changes relative to the state captured before the reset.
    if (headTargets.contains(targetId) ^ target->isHead) {
      if (target->isHead) {
        XLOGF_IF(WARNING, log, "target {} becomes head", targetId);
      } else {
        XLOGF_IF(WARNING, log, "target {} is no longer head", targetId);
      }
    }
    if (tailTargets.contains(targetId) ^ target->isTail) {
      if (target->isTail) {
        XLOGF_IF(WARNING, log, "target {} becomes tail", targetId);
      } else {
        XLOGF_IF(WARNING, log, "target {} is no longer tail", targetId);
      }
    }
  }

  // Targets that were LASTSRV and are now SERVING / SYNCING / WAITING get their uncommitted chunks reset.
  for (auto &[targetId, target] : targets_) {
    if (lastSrvTargets.contains(targetId) && target.storageTarget &&
        (target.publicState == flat::PublicTargetState::SERVING ||
         target.publicState == flat::PublicTargetState::SYNCING ||
         target.publicState == flat::PublicTargetState::WAITING)) {
      target.storageTarget->resetUncommitted(target.vChainId.chainVer);
    }
  }

  recordGuard.succ();
  return Void{};
}
|
||||
|
||||
// Erase the entry for `targetId`; returns kRoutingError when no such entry exists.
Result<Void> TargetMap::removeTarget(TargetId targetId) {
  const auto erasedCount = targets_.erase(targetId);
  if (erasedCount == 1) {
    return Void{};
  }
  return makeError(StorageClientCode::kRoutingError, fmt::format("target {} not found", targetId));
}
|
||||
|
||||
// Put a single target into local state OFFLINE on user request.
// Returns kTargetOffline when the target already reports unrecoverableOffline().
Result<Void> TargetMap::offlineTarget(TargetId targetId) {
  CHECK_RESULT(entry, getMutableTarget(targetId));

  const bool alreadyOffline = entry->unrecoverableOffline();
  if (alreadyOffline) {
    return makeError(StorageCode::kTargetOffline, fmt::format("target is already offline, {}.", *entry));
  }

  entry->localState = flat::LocalTargetState::OFFLINE;
  entry->offlineUponUserRequest = true;
  return Void{};
}
|
||||
|
||||
// Mark every target whose parent directory equals `path` as OFFLINE with the disk-error flag set.
// Targets that already report unrecoverableOffline() are left untouched.
Result<Void> TargetMap::offlineTargets(const Path &path) {
  for (auto &[id, entry] : targets_) {
    const bool onThisDisk = (path == entry.path.parent_path());
    if (!onThisDisk || entry.unrecoverableOffline()) {
      continue;
    }
    entry.diskError = true;
    entry.localState = flat::LocalTargetState::OFFLINE;
    XLOGF(WARNING, "offline target {} because of disk error", entry.path);
  }
  return Void{};
}
|
||||
|
||||
// Propagate disk-level flags to every target whose parent directory equals `path`, skipping targets that
// report unrecoverableOffline(). A change of the reject-create-chunk flag is logged.
Result<Void> TargetMap::updateDiskState(const Path &path, bool lowSpace, bool rejectCreateChunk) {
  for (auto &[id, entry] : targets_) {
    const bool onThisDisk = (path == entry.path.parent_path());
    if (!onThisDisk || entry.unrecoverableOffline()) {
      continue;
    }

    entry.lowSpace = lowSpace;

    auto previousReject = entry.rejectCreateChunk;
    entry.rejectCreateChunk = rejectCreateChunk;
    if (previousReject != rejectCreateChunk) {
      XLOGF(WARNING, "target {} reject create chunk {} -> {}", entry.path, previousReject, rejectCreateChunk);
    }
  }
  return Void{};
}
|
||||
|
||||
// Compute the next local state of a target from its current local state and its new public state:
//  - UPTODATE + public OFFLINE / LASTSRV / WAITING  -> OFFLINE  (logged as CRITICAL)
//  - ONLINE   + public SERVING                      -> UPTODATE (logged as INFO)
//  - anything else                                  -> unchanged
// `targetId` is used for logging only.
hf3fs::flat::LocalTargetState TargetMap::updateLocalState(TargetId targetId,
                                                          hf3fs::flat::LocalTargetState localState,
                                                          hf3fs::flat::PublicTargetState publicState) {
  using LocalState = hf3fs::flat::LocalTargetState;
  using PublicState = hf3fs::flat::PublicTargetState;

  if (localState == LocalState::UPTODATE) {
    const bool publiclyDown = publicState == PublicState::OFFLINE || publicState == PublicState::LASTSRV ||
                              publicState == PublicState::WAITING;
    if (publiclyDown) {
      XLOGF(CRITICAL,
            "move to offline state (shutdown), local target: {}, local state: {} -> OFFLINE, public state: {}",
            targetId,
            magic_enum::enum_name(localState),
            magic_enum::enum_name(publicState));
      return LocalState::OFFLINE;
    }
    return localState;
  }

  if (localState == LocalState::ONLINE && publicState == PublicState::SERVING) {
    XLOGF(INFO,
          "move to up-to-date state, local target: {}, local state: {} -> UPTODATE, public state: {}",
          targetId,
          magic_enum::enum_name(localState),
          magic_enum::enum_name(publicState));
    return LocalState::UPTODATE;
  }

  return localState;
}
|
||||
|
||||
// Resolve a target by versioned chain id on the current snapshot.
// The returned pointer shares ownership with that snapshot, so the target stays valid for as long as the
// caller holds it, even if the map is replaced afterwards.
Result<std::shared_ptr<const Target>> AtomicallyTargetMap::getByChainId(
    VersionedChainId vChainId,
    bool allowOutdatedChainVer /* = false */) const {
  auto current = snapshot();
  auto lookup = current->getByChainId(vChainId, allowOutdatedChainVer);
  RETURN_ON_ERROR(lookup);
  const Target *found = *lookup;
  return std::shared_ptr<const Target>(std::move(current), found);
}
|
||||
|
||||
// Resolve a target by id on the current snapshot.
// The returned pointer shares ownership with that snapshot, keeping the target alive for the caller.
Result<std::shared_ptr<const Target>> AtomicallyTargetMap::getByTargetId(TargetId targetId) const {
  auto current = snapshot();
  auto lookup = current->getTarget(targetId);
  RETURN_ON_ERROR(lookup);
  const Target *found = *lookup;
  return std::shared_ptr<const Target>(std::move(current), found);
}
|
||||
|
||||
// Apply `updateFunc` to a private clone of the current map and publish the clone.
// Updates are serialized by mutex_; the clone is installed with compare-exchange and the update is retried
// on a fresh clone if the exchange fails. If `updateFunc` fails, its error is logged and returned and
// nothing is published. After a successful publish the update callback is invoked with the new snapshot.
Result<Void> AtomicallyTargetMap::updateTargetMap(auto &&updateFunc) {
  std::unique_lock<std::mutex> guard(mutex_);
  auto current = snapshot();
  for (;;) {
    auto candidate = current->clone();
    RETURN_AND_LOG_ON_ERROR(updateFunc(candidate));
    const bool published = targetMap_.compare_exchange_strong(current, std::move(candidate));
    if (published) {
      break;
    }
  }
  updateCallback_(*snapshot());
  return Void{};
}
|
||||
|
||||
// Atomically publish a map that additionally contains `storageTarget`.
Result<Void> AtomicallyTargetMap::addStorageTarget(std::shared_ptr<StorageTarget> storageTarget) {
  auto apply = [&](std::shared_ptr<TargetMap> &newMap) { return newMap->addStorageTarget(storageTarget); };
  return updateTargetMap(apply);
}
|
||||
|
||||
// Atomically publish a map in which sync receive is marked done for `vChainId`.
Result<Void> AtomicallyTargetMap::syncReceiveDone(VersionedChainId vChainId) {
  auto apply = [&](std::shared_ptr<TargetMap> &newMap) { return newMap->syncReceiveDone(vChainId); };
  return updateTargetMap(apply);
}
|
||||
|
||||
// Atomically publish a map rebuilt from routing info `r`.
Result<Void> AtomicallyTargetMap::updateRouting(std::shared_ptr<hf3fs::client::RoutingInfo> r) {
  auto apply = [&](std::shared_ptr<TargetMap> &newMap) { return newMap->updateRouting(r); };
  return updateTargetMap(apply);
}
|
||||
|
||||
// Atomically publish a map without the entry for `targetId`.
Result<Void> AtomicallyTargetMap::removeTarget(TargetId targetId) {
  auto apply = [&](std::shared_ptr<TargetMap> &newMap) { return newMap->removeTarget(targetId); };
  return updateTargetMap(apply);
}
|
||||
|
||||
// Atomically publish a map in which `targetId` is set offline.
Result<Void> AtomicallyTargetMap::offlineTarget(TargetId targetId) {
  auto apply = [&](std::shared_ptr<TargetMap> &newMap) { return newMap->offlineTarget(targetId); };
  return updateTargetMap(apply);
}
|
||||
|
||||
// Atomically publish a map in which all targets under `path` are set offline.
Result<Void> AtomicallyTargetMap::offlineTargets(const Path &path) {
  auto apply = [&](std::shared_ptr<TargetMap> &newMap) { return newMap->offlineTargets(path); };
  return updateTargetMap(apply);
}
|
||||
|
||||
// Atomically publish a map with updated disk flags for all targets under `path`.
Result<Void> AtomicallyTargetMap::updateDiskState(const Path &path, bool lowSpace, bool rejectCreateChunk) {
  auto apply = [&](std::shared_ptr<TargetMap> &newMap) {
    return newMap->updateDiskState(path, lowSpace, rejectCreateChunk);
  };
  return updateTargetMap(apply);
}
|
||||
|
||||
void AtomicallyTargetMap::updateTargetUsedSize() {
|
||||
auto lock = std::unique_lock(mutex_);
|
||||
updateCallback_(*snapshot());
|
||||
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
137
src/storage/service/TargetMap.h
Normal file
137
src/storage/service/TargetMap.h
Normal file
@@ -0,0 +1,137 @@
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <common/utils/RobinHood.h>
|
||||
#include <folly/concurrency/AtomicSharedPtr.h>
|
||||
#include <memory>
|
||||
|
||||
#include "client/mgmtd/RoutingInfo.h"
|
||||
#include "common/serde/Serde.h"
|
||||
#include "common/utils/ConstructLog.h"
|
||||
#include "fbs/mgmtd/MgmtdTypes.h"
|
||||
#include "fbs/mgmtd/NodeInfo.h"
|
||||
#include "fbs/mgmtd/TargetInfo.h"
|
||||
#include "fbs/storage/Common.h"
|
||||
#include "storage/store/StorageTarget.h"
|
||||
|
||||
namespace hf3fs::test {
|
||||
struct TargetMapHelper;
|
||||
}
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
class TargetMap {
|
||||
public:
|
||||
// [observers] clone current map.
|
||||
std::shared_ptr<TargetMap> clone() const { return std::make_shared<TargetMap>(*this); }
|
||||
|
||||
// [observers] get target id by chain id.
|
||||
Result<TargetId> getTargetId(ChainId chainId) const;
|
||||
|
||||
// [observers] get target by target id.
|
||||
Result<const Target *> getTarget(TargetId targetId) const;
|
||||
|
||||
// [observers] get target by versioned chain id.
|
||||
Result<const Target *> getByChainId(VersionedChainId vChainId, bool allowOutdatedChainVer) const;
|
||||
|
||||
// [observers]
|
||||
auto &getTargets() const { return targets_; }
|
||||
|
||||
// [observers]
|
||||
auto &syncingChains() const { return syncingChains_; }
|
||||
|
||||
// [modifiers] add a new target.
|
||||
Result<Void> addStorageTarget(const std::shared_ptr<StorageTarget> &storageTarget);
|
||||
|
||||
// [modifiers] get target by target id.
|
||||
Result<Target *> getMutableTarget(TargetId targetId);
|
||||
|
||||
// [modifiers] sync receive is started.
|
||||
Result<Void> syncReceiveDone(VersionedChainId vChainId);
|
||||
|
||||
// [modifiers] update by routing info.
|
||||
Result<Void> updateRouting(std::shared_ptr<hf3fs::client::RoutingInfo> r, bool log = true);
|
||||
|
||||
// [modifiers] set target as offline.
|
||||
Result<Void> removeTarget(TargetId targetId);
|
||||
|
||||
// [modifiers] set target as offline.
|
||||
Result<Void> offlineTarget(TargetId targetId);
|
||||
|
||||
// [modifiers] set targets in path as offline.
|
||||
Result<Void> offlineTargets(const Path &path);
|
||||
|
||||
// [modifiers] reject create chunk for targets in path.
|
||||
Result<Void> updateDiskState(const Path &path, bool lowSpace, bool rejectCreateChunk);
|
||||
|
||||
// update local state.
|
||||
static hf3fs::flat::LocalTargetState updateLocalState(TargetId targetId,
|
||||
hf3fs::flat::LocalTargetState localState,
|
||||
hf3fs::flat::PublicTargetState publicState);
|
||||
|
||||
private:
|
||||
friend struct test::TargetMapHelper;
|
||||
robin_hood::unordered_map<TargetId, Target> targets_;
|
||||
flat::RoutingInfoVersion routingInfoVersion_;
|
||||
robin_hood::unordered_map<ChainId, TargetId> chainToTarget_;
|
||||
std::vector<VersionedChainId> syncingChains_;
|
||||
};
|
||||
|
||||
class AtomicallyTargetMap {
|
||||
public:
|
||||
// [observers] get a snapshot of target map.
|
||||
auto snapshot() const { return targetMap_.load(); }
|
||||
|
||||
// [observers] get target by chain id.
|
||||
Result<std::shared_ptr<const Target>> getByChainId(VersionedChainId vChainId,
|
||||
bool allowOutdatedChainVer = false) const;
|
||||
|
||||
// [observers] get target by its id.
|
||||
Result<std::shared_ptr<const Target>> getByTargetId(TargetId targetId) const;
|
||||
|
||||
// [modifiers] set update callback.
|
||||
void setUpdateCallback(auto &&func) {
|
||||
auto lock = std::unique_lock(mutex_);
|
||||
updateCallback_ = std::forward<decltype(func)>(func);
|
||||
}
|
||||
|
||||
// [modifiers] add a target.
|
||||
Result<Void> addStorageTarget(std::shared_ptr<StorageTarget> storageTarget);
|
||||
|
||||
// [modifiers] sync receive is done.
|
||||
Result<Void> syncReceiveDone(VersionedChainId vChainId);
|
||||
|
||||
// [modifiers] update by routing info.
|
||||
Result<Void> updateRouting(std::shared_ptr<hf3fs::client::RoutingInfo> r);
|
||||
|
||||
// [modifiers] set target as offline.
|
||||
Result<Void> removeTarget(TargetId targetId);
|
||||
|
||||
// [modifiers] set target as offline.
|
||||
Result<Void> offlineTarget(TargetId targetId);
|
||||
|
||||
// [modifiers] set targets in path as offline.
|
||||
Result<Void> offlineTargets(const Path &path);
|
||||
|
||||
// [modifiers] reject create chunk for targets in path.
|
||||
Result<Void> updateDiskState(const Path &path, bool lowSpace, bool rejectCreateChunk);
|
||||
|
||||
// [modifiers] update target used size.
|
||||
void updateTargetUsedSize();
|
||||
|
||||
// [modifiers] release target map.
|
||||
auto release() { return targetMap_.exchange(nullptr); }
|
||||
|
||||
protected:
|
||||
// [modifiers] update target map atomically.
|
||||
Result<Void> updateTargetMap(auto &&updateFunc);
|
||||
|
||||
private:
|
||||
friend struct test::TargetMapHelper;
|
||||
ConstructLog<"storage::AtomicallyTargetMap"> constructLog_;
|
||||
std::mutex mutex_; // for update operation.
|
||||
std::function<void(const TargetMap &)> updateCallback_ = [](auto) {};
|
||||
folly::atomic_shared_ptr<const TargetMap> targetMap_{std::make_shared<const TargetMap>()};
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
8
src/storage/storage.cpp
Normal file
8
src/storage/storage.cpp
Normal file
@@ -0,0 +1,8 @@
|
||||
#include "common/app/TwoPhaseApplication.h"
|
||||
#include "memory/common/OverrideCppNewDelete.h"
|
||||
#include "storage/service/StorageServer.h"
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
using namespace hf3fs;
|
||||
return TwoPhaseApplication<storage::StorageServer>().run(argc, argv);
|
||||
}
|
||||
112
src/storage/store/ChunkEngine.cc
Normal file
112
src/storage/store/ChunkEngine.cc
Normal file
@@ -0,0 +1,112 @@
|
||||
#include "ChunkEngine.h"

#include <cstring>

#include "chunk_engine/src/cxx.rs.h"
#include "fbs/storage/Common.h"
#include "storage/update/UpdateJob.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
namespace {
|
||||
|
||||
monitor::OperationRecorder storageUpdateRecorder{"storage.engine_update"};
|
||||
monitor::OperationRecorder storageCommitRecorder{"storage.engine_commit"};
|
||||
|
||||
} // namespace
|
||||
|
||||
// Apply the update described by `job` (write / truncate / remove / other) to the chunk engine.
//
// Fills job.result() with the versions and checksum reported by the engine, and on success stores the
// returned chunk handle in job.chunkEngineJob() for the later commit step.
// Returns the engine-reported chunk length for truncate/extend, otherwise the requested IO length; on
// failure returns the engine's error code and message.
Result<uint32_t> ChunkEngine::update(chunk_engine::Engine &engine, UpdateJob &job) {
  auto recordGuard = storageUpdateRecorder.record();

  // 1. prepare.
  const auto &updateIO = job.updateIO();
  const auto &chunkId = updateIO.key.chunkId;
  const auto &options = job.options();
  const auto &state = job.state();
  auto &result = job.result();

  // Engine key layout: raw bytes of the chain id followed by the chunk id bytes.
  auto chainId = updateIO.key.vChainId.chainId;
  std::string key;
  key.reserve(sizeof(chainId) + chunkId.data().size());
  key.append((const char *)&chainId, sizeof(chainId));
  key.append(chunkId.data());

  // 2. start update.
  chunk_engine::UpdateReq req{};
  if (updateIO.isTruncate()) {
    req.is_truncate = true;
  } else if (updateIO.isRemove()) {
    req.is_remove = true;
  }
  req.is_syncing = options.isSyncing;
  req.update_ver = updateIO.updateVer;
  req.chain_ver = job.commitChainVer();
  if (updateIO.checksum.type == ChecksumType::CRC32C) {
    // The engine side works with the bitwise complement of the CRC32C value.
    req.checksum = ~updateIO.checksum.value;
  } else if (state.data) {
    req.without_checksum = true;
  }
  if (updateIO.isWrite()) {
    req.length = updateIO.length;
    req.offset = updateIO.offset;
  } else {
    // For non-write updates the IO length is passed in the offset field and no data length is set.
    req.length = 0;
    req.offset = updateIO.length;
  }
  req.data = reinterpret_cast<uint64_t>(state.data);
  req.last_request_id = job.requestCtx().tag.requestId;

  // Split the client uuid bytes into two 64-bit halves. memcpy is used instead of casting the byte
  // pointer to uint64_t*, which would be a misaligned / aliasing-violating read.
  auto clientId = job.requestCtx().tag.clientId.uuid.asStringView();
  uint64_t clientLow = 0;
  uint64_t clientHigh = 0;
  std::memcpy(&clientLow, clientId.data(), sizeof(clientLow));
  std::memcpy(&clientHigh, clientId.data() + sizeof(clientLow), sizeof(clientHigh));
  req.last_client_low = clientLow;
  req.last_client_high = clientHigh;

  std::string error{};
  auto chunk = engine.update_raw_chunk(toSlice(key), req, error);
  // The result is populated from the request's out-fields even when the engine reports an error.
  result.updateVer = result.commitVer = ChunkVer{req.out_commit_ver};
  result.commitChainVer = ChainVer{req.out_chain_ver};
  if (req.is_remove && req.out_non_existent) {
    result.checksum = ChecksumInfo{ChecksumType::NONE, 0};
  } else {
    result.checksum = ChecksumInfo{ChecksumType::CRC32C, ~req.out_checksum};
  }

  if (UNLIKELY(!error.empty())) {
    return makeError(req.out_error_code, std::move(error));
  }

  job.chunkEngineJob().set(engine, chunk);

  recordGuard.succ();
  if (updateIO.isTruncate() || updateIO.isExtend()) {
    return chunk->raw_meta().len;
  }
  return updateIO.length;
}
|
||||
|
||||
// Commit the chunk handle that a previous update() stored in job.chunkEngineJob().
//
// Stamps the chunk with the job's commit chain version, copies the resulting versions into job.result(),
// asks the engine to commit (`sync` is forwarded to the engine) and releases the handle regardless of the
// outcome. Returns kChunkMetadataSetError with the engine's message on failure, otherwise 0.
//
// Note: the engine key is not needed here (the commit works on the chunk handle), so no key is built.
// This assumes job.commitIO() is a plain accessor without side effects.
Result<uint32_t> ChunkEngine::commit(chunk_engine::Engine &engine, UpdateJob &job, bool sync) {
  auto recordGuard = storageCommitRecorder.record();

  auto &result = job.result();

  auto chunk = job.chunkEngineJob().chunk();
  chunk->set_chain_ver(job.commitChainVer());
  auto &meta = chunk->raw_meta();
  result.updateVer = result.commitVer = ChunkVer{meta.chunk_ver};
  result.commitChainVer = ChainVer{meta.chain_ver};

  std::string error;
  engine.commit_raw_chunk(chunk, sync, error);
  job.chunkEngineJob().release();
  if (UNLIKELY(!error.empty())) {
    return makeError(StorageCode::kChunkMetadataSetError, std::move(error));
  }

  recordGuard.succ();
  return uint32_t{};
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
267
src/storage/store/ChunkEngine.h
Normal file
267
src/storage/store/ChunkEngine.h
Normal file
@@ -0,0 +1,267 @@
|
||||
#pragma once
|
||||
|
||||
#include <limits>
|
||||
|
||||
#include "chunk_engine/src/cxx.rs.h"
|
||||
#include "common/utils/UtcTime.h"
|
||||
#include "fbs/storage/Common.h"
|
||||
#include "storage/aio/BatchReadJob.h"
|
||||
#include "storage/update/UpdateJob.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
struct ChunkEngine {
|
||||
// Translate an engine RawMeta record into a ChunkMetadata.
// Commit and update version are both taken from the single engine chunk version; state is always reported
// as COMMIT / NORMAL; the checksum is CRC32C with the bitwise complement of the engine's stored value.
static void copyMeta(const chunk_engine::RawMeta &in, ChunkMetadata &out) {
  // versions
  const ChunkVer engineVer{in.chunk_ver};
  out.commitVer = engineVer;
  out.updateVer = engineVer;
  out.chainVer = ChainVer{in.chain_ver};

  // fixed states and checksum
  out.chunkState = ChunkState::COMMIT;
  out.recycleState = RecycleState::NORMAL;
  out.checksumType = ChecksumType::CRC32C;
  out.checksumValue = ~in.checksum;

  // size and placement; the first ChunkFileId field is derived from the top 16 bits of `pos`,
  // with a floor of 512 KiB.
  out.size = in.len;
  const auto fromPos = uint32_t(in.pos >> 48 << 16);
  out.innerFileId = ChunkFileId{std::max(fromPos, 512u * 1024), 256};
  out.innerOffset = in.pos;

  // provenance
  out.timestamp = UtcTime::fromMicroseconds(in.timestamp);
  out.lastRequestId = RequestId{in.last_request_id};
  out.lastClientUuid = Uuid::from(in.last_client_low, in.last_client_high);
}
|
||||
|
||||
// View the bytes of `key` as a Rust slice. The slice does not own the data, so `key` must outlive it.
static rust::Slice<const uint8_t> toSlice(const std::string &key) {
  const auto *bytes = (const uint8_t *)key.data();
  const auto count = key.size();
  return rust::Slice<const uint8_t>{bytes, count};
}
|
||||
|
||||
// Prepare an AIO read job against the chunk engine.
// If the job has no chunk handle yet, the chunk is looked up by (chain id, chunk id) and attached to the
// job state; then versions, chunk length/checksum and the fd / offset / length to read are filled in.
// Returns kChunkMetadataGetError on engine failure and kChunkMetadataNotFound when the chunk is missing.
static Result<Void> aioPrepareRead(chunk_engine::Engine &engine, AioReadJob &job) {
  auto &state = job.state();

  if (!state.chunkEngineJob.has_chunk()) {
    const auto &readKey = job.readIO().key;
    const auto &chunkId = readKey.chunkId;
    auto chainId = readKey.vChainId.chainId;

    // Engine key layout: raw bytes of the chain id followed by the chunk id bytes.
    std::string lookupKey;
    lookupKey.reserve(sizeof(chainId) + chunkId.data().size());
    lookupKey.append((const char *)&chainId, sizeof(chainId));
    lookupKey.append(chunkId.data());

    std::string engineError;
    auto rawChunk = engine.get_raw_chunk(toSlice(lookupKey), engineError);
    if (UNLIKELY(!engineError.empty())) {
      return makeError(StorageCode::kChunkMetadataGetError, std::move(engineError));
    }
    if (rawChunk == nullptr) {
      return makeError(StorageCode::kChunkMetadataNotFound);
    }

    state.chunkEngineJob.set(&engine, rawChunk);
  }

  // Publish the chunk's versions and checksum.
  auto &result = job.result();
  auto &meta = state.chunkEngineJob.chunk()->raw_meta();
  result.commitVer = ChunkVer{meta.chunk_ver};
  result.updateVer = ChunkVer{meta.chunk_ver};
  result.commitChainVer = ChainVer{meta.chain_ver};
  state.chunkLen = meta.len;
  state.chunkChecksum = ChecksumInfo{ChecksumType::CRC32C, ~meta.checksum};

  // Translate the job's aligned range into a position in the backing file.
  auto location = state.chunkEngineJob.chunk()->fd_and_offset();
  state.readFd = location.fd;
  state.readOffset = location.offset + job.alignedOffset();
  state.readLength = job.alignedLength();

  return Void{};
}
|
||||
|
||||
static Result<uint32_t> update(chunk_engine::Engine &engine, UpdateJob &job);
|
||||
|
||||
static Result<uint32_t> commit(chunk_engine::Engine &engine, UpdateJob &job, bool sync);
|
||||
|
||||
// Fetch the metadata of a single chunk of `chainId`.
// Returns kChunkMetadataGetError on engine failure and kChunkMetadataNotFound when the chunk is missing.
// The chunk handle obtained from the engine is released before returning the metadata.
static Result<ChunkMetadata> queryChunk(chunk_engine::Engine &engine, const ChunkId &chunkId, ChainId chainId) {
  // Engine key layout: raw bytes of the chain id followed by the chunk id bytes.
  std::string lookupKey;
  lookupKey.reserve(sizeof(chainId) + chunkId.data().size());
  lookupKey.append((const char *)&chainId, sizeof(chainId));
  lookupKey.append(chunkId.data());

  std::string engineError;
  auto rawChunk = engine.get_raw_chunk(toSlice(lookupKey), engineError);
  if (UNLIKELY(!engineError.empty())) {
    return makeError(StorageCode::kChunkMetadataGetError, std::move(engineError));
  }
  if (rawChunk == nullptr) {
    return makeError(StorageCode::kChunkMetadataNotFound);
  }

  ChunkMetadata metadata;
  copyMeta(rawChunk->raw_meta(), metadata);
  engine.release_raw_chunk(rawChunk);
  return metadata;
}
|
||||
|
||||
// Query chunk ids and metadata of `chainId` within `chunkIdRange`, passing
// `chunkIdRange.maxNumChunkIdsToProcess` to the engine as the limit.
// Returned chunk ids have the chain-id prefix of the engine key stripped.
// Returns kChunkMetadataGetError on engine failure.
static Result<std::vector<std::pair<ChunkId, ChunkMetadata>>> queryChunks(chunk_engine::Engine &engine,
                                                                          const ChunkIdRange &chunkIdRange,
                                                                          ChainId chainId) {
  // Builds an engine key: raw chain id bytes followed by the chunk id bytes.
  const auto makeKey = [&chainId](const auto &chunkId) {
    std::string key;
    key.reserve(sizeof(chainId) + chunkId.data().size());
    key.append((const char *)&chainId, sizeof(chainId));
    key.append(chunkId.data());
    return key;
  };
  const std::string beginKey = makeKey(chunkIdRange.begin);
  const std::string endKey = makeKey(chunkIdRange.end);

  std::string engineError;
  auto chunks = engine.query_raw_chunks(toSlice(beginKey),
                                        toSlice(endKey),
                                        chunkIdRange.maxNumChunkIdsToProcess,
                                        engineError);
  if (UNLIKELY(!engineError.empty())) {
    return makeError(StorageCode::kChunkMetadataGetError, std::move(engineError));
  }

  const auto count = chunks->len();
  std::vector<std::pair<ChunkId, ChunkMetadata>> out;
  out.reserve(count);
  for (size_t idx = 0; idx < count; ++idx) {
    auto rawId = chunks->chunk_id(idx);
    auto &rawMeta = chunks->chunk_meta(idx);

    auto &entry = out.emplace_back();
    // Skip the chain-id prefix to recover the chunk id.
    entry.first =
        ChunkId(std::string_view{(const char *)rawId.data() + sizeof(chainId), rawId.length() - sizeof(chainId)});
    copyMeta(rawMeta, entry.second);
  }
  return out;
}
|
||||
|
||||
// List the ids of all uncommitted chunks of `chainId` (the raw chain id bytes are used as key prefix).
// Returned chunk ids have the chain-id prefix stripped. Returns kChunkMetadataGetError on engine failure.
static Result<std::vector<ChunkId>> queryUncommittedChunks(chunk_engine::Engine &engine, ChainId chainId) {
  rust::Slice<const uint8_t> chainPrefix{(const uint8_t *)&chainId, sizeof(chainId)};

  std::string engineError;
  auto chunks = engine.query_uncommitted_raw_chunks(chainPrefix, engineError);
  if (UNLIKELY(!engineError.empty())) {
    return makeError(StorageCode::kChunkMetadataGetError, std::move(engineError));
  }

  const auto count = chunks->len();
  std::vector<ChunkId> out;
  out.reserve(count);
  for (size_t idx = 0; idx < count; ++idx) {
    auto rawId = chunks->chunk_id(idx);
    // Skip the chain-id prefix to recover the chunk id.
    std::string_view idBytes{(const char *)rawId.data() + sizeof(chainId), rawId.length() - sizeof(chainId)};
    out.push_back(ChunkId(idBytes));
  }
  return out;
}
|
||||
|
||||
// Ask the engine to handle all uncommitted chunks of `chainId` for chain version `chainVer`, and log every
// chunk the engine reports back. Returns kChunkMetadataGetError (after a CRITICAL log) on engine failure.
static Result<Void> resetUncommittedChunks(chunk_engine::Engine &engine, ChainId chainId, ChainVer chainVer) {
  rust::Slice<const uint8_t> chainPrefix{(const uint8_t *)&chainId, sizeof(chainId)};

  std::string engineError;
  auto chunks = engine.handle_uncommitted_raw_chunks(chainPrefix, chainVer, engineError);
  if (UNLIKELY(!engineError.empty())) {
    XLOGF(CRITICAL, "reset uncommitted chunks failed: {}, chain {}", engineError, chainId);
    return makeError(StorageCode::kChunkMetadataGetError, std::move(engineError));
  }

  const auto count = chunks->len();
  XLOGF_IF(CRITICAL, count > 0, "reset uncommitted chunks succ, chain: {}, size: {}", chainId, count);
  for (size_t idx = 0; idx < count; ++idx) {
    auto rawId = chunks->chunk_id(idx);
    auto &rawMeta = chunks->chunk_meta(idx);

    // Skip the chain-id prefix to recover the chunk id.
    auto id =
        ChunkId(std::string_view{(const char *)rawId.data() + sizeof(chainId), rawId.length() - sizeof(chainId)});
    ChunkMetadata meta{};
    copyMeta(rawMeta, meta);
    XLOGF(CRITICAL, "reset uncommitted chain {} chunk {} meta {}", chainId, id, meta);
  }

  return Void{};
}
|
||||
|
||||
// Batch-remove the chunks of `chainId`: the raw chain id bytes are passed to the engine as both the begin
// and the end key, with no limit on the number of chunks.
// Returns kChunkMetadataSetError with the engine's message on failure.
static Result<Void> removeAllChunks(chunk_engine::Engine &engine, ChainId chainId) {
  const std::string chainKey((const char *)&chainId, sizeof(chainId));

  std::string engineError;
  engine.raw_batch_remove(toSlice(chainKey), toSlice(chainKey), std::numeric_limits<uint64_t>::max(), engineError);
  if (UNLIKELY(engineError.empty())) {
    return Void{};
  }
  return makeError(StorageCode::kChunkMetadataSetError, std::move(engineError));
}
|
||||
|
||||
// Appends the metadata of every chunk stored under `chainId` to `metadataVec`, then
// sorts the whole vector by chunk id in descending order.
// Returns kChunkMetadataGetError carrying the engine's message if the query fails.
static Result<Void> getAllMetadata(chunk_engine::Engine &engine, ChainId chainId, ChunkMetaVector &metadataVec) {
  // Engine keys start with the raw bytes of the chain id; the rest is the chunk id.
  const auto prefixLen = sizeof(chainId);
  rust::Slice<const uint8_t> chainPrefix{reinterpret_cast<const uint8_t *>(&chainId), prefixLen};

  std::string error;
  auto chunks = engine.query_all_raw_chunks(chainPrefix, error);
  if (UNLIKELY(!error.empty())) {
    return makeError(StorageCode::kChunkMetadataGetError, std::move(error));
  }

  const auto count = chunks->len();
  metadataVec.reserve(count);
  for (size_t idx = 0; idx != count; ++idx) {
    auto rawKey = chunks->chunk_id(idx);
    auto &engineMeta = chunks->chunk_meta(idx);

    metadataVec.emplace_back();
    auto &entry = metadataVec.back();

    std::string_view suffix{reinterpret_cast<const char *>(rawKey.data()) + prefixLen, rawKey.length() - prefixLen};
    entry.chunkId = ChunkId(suffix);
    // Start from the "committed" view: both versions equal the engine's chunk version.
    entry.updateVer = ChunkVer{engineMeta.chunk_ver};
    entry.commitVer = ChunkVer{engineMeta.chunk_ver};
    entry.chainVer = ChainVer{engineMeta.chain_ver};
    entry.chunkState = ChunkState::COMMIT;
    // The engine's checksum value is bit-inverted before being exposed as CRC32C.
    entry.checksum = ChecksumInfo{ChecksumType::CRC32C, ~engineMeta.checksum};
    entry.length = engineMeta.len;

    // Uncommitted chunks are reported one commit version behind, in CLEAN state.
    if (chunks->chunk_uncommitted(idx)) {
      entry.commitVer = ChunkVer{entry.commitVer - 1};
      entry.chunkState = ChunkState::CLEAN;
    }
  }

  std::sort(metadataVec.begin(), metadataVec.end(), [](auto &lhs, auto &rhs) { return lhs.chunkId > rhs.chunkId; });
  return Void{};
}
|
||||
|
||||
// Loads the metadata of every chunk stored under `chainId` into `metas`, keyed by
// chunk id. Existing entries with the same key are overwritten by copyMeta().
// Returns kChunkMetadataGetError carrying the engine's message if the query fails.
static Result<Void> getAllMetadataMap(chunk_engine::Engine &engine,
                                      std::unordered_map<ChunkId, ChunkMetadata> &metas,
                                      ChainId chainId) {
  // Engine keys start with the raw bytes of the chain id; the rest is the chunk id.
  const auto prefixLen = sizeof(chainId);
  rust::Slice<const uint8_t> chainPrefix{reinterpret_cast<const uint8_t *>(&chainId), prefixLen};

  std::string error;
  auto chunks = engine.query_all_raw_chunks(chainPrefix, error);
  if (UNLIKELY(!error.empty())) {
    return makeError(StorageCode::kChunkMetadataGetError, std::move(error));
  }

  const auto count = chunks->len();
  metas.reserve(metas.size() + count);
  for (size_t idx = 0; idx != count; ++idx) {
    auto rawKey = chunks->chunk_id(idx);
    std::string_view suffix{reinterpret_cast<const char *>(rawKey.data()) + prefixLen, rawKey.length() - prefixLen};
    auto &slot = metas[ChunkId(suffix)];

    copyMeta(chunks->chunk_meta(idx), slot);
    // Uncommitted chunks are reported one commit version behind, in CLEAN state.
    if (chunks->chunk_uncommitted(idx)) {
      slot.commitVer = ChunkVer{slot.commitVer - 1};
      slot.chunkState = ChunkState::CLEAN;
    }
  }

  return Void{};
}
|
||||
|
||||
// Queries the engine for the used size of the chain identified by `chainId`.
// An engine error is only logged; whatever value the engine returned is still passed on.
static uint64_t chainUsedSize(chunk_engine::Engine &engine, ChainId chainId) {
  rust::Slice<const uint8_t> chainPrefix{reinterpret_cast<const uint8_t *>(&chainId), sizeof(chainId)};

  std::string error;
  auto usedSize = engine.query_raw_used_size(chainPrefix, error);
  const bool failed = !error.empty();
  if (UNLIKELY(failed)) {
    XLOGF(ERR, "query chunk engine chain used size error: chain {} error {}", chainId, error);
  }
  return usedSize;
}
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
152
src/storage/store/ChunkFileStore.cc
Normal file
152
src/storage/store/ChunkFileStore.cc
Normal file
@@ -0,0 +1,152 @@
|
||||
#include "storage/store/ChunkFileStore.h"
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <folly/logging/xlog.h>
|
||||
#include <limits>
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/utils/Result.h"
|
||||
#include "storage/store/ChunkMetadata.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
// File-local monitoring recorders for ChunkFileStore operations.
namespace {
|
||||
|
||||
// Records ChunkFileStore::punchHole() calls.
monitor::OperationRecorder punchHoleRecorder{"storage.punch_hole"};
|
||||
// Records ChunkFileStore::allocate() calls.
monitor::OperationRecorder allocateSpaceRecorder{"storage.allocate_space"};
|
||||
|
||||
} // namespace
|
||||
|
||||
// Initializes a new file store: remembers the path and physical file count from
// `config`, then creates the inner files for every configured chunk size.
// Stops at, and returns, the first error.
Result<Void> ChunkFileStore::create(const PhysicalConfig &config) {
  physicalFileCount_ = config.physical_file_count;
  path_ = config.path;

  const auto &sizes = config.chunk_size_list;
  for (auto it = sizes.begin(); it != sizes.end(); ++it) {
    RETURN_AND_LOG_ON_ERROR(createInnerFile(*it));
  }
  return Void{};
}
|
||||
|
||||
// Loads an existing file store: remembers the path and physical file count from
// `config`, then eagerly opens every inner file of the chunk sizes that appear in the
// `preopen_chunk_size_list` config item. Other chunk sizes are left to be opened later.
Result<Void> ChunkFileStore::load(const PhysicalConfig &config) {
  physicalFileCount_ = config.physical_file_count;
  path_ = config.path;

  for (uint32_t chunkSize : config.chunk_size_list) {
    if (!config_.preopen_chunk_size_list().contains(chunkSize)) {
      continue;  // not marked for pre-opening.
    }
    for (auto fileIdx = 0u; fileIdx < physicalFileCount_; ++fileIdx) {
      RETURN_AND_LOG_ON_ERROR(openInnerFile(ChunkFileId{chunkSize, fileIdx}));
    }
  }
  return Void{};
}
|
||||
|
||||
// Creates the inner files for each chunk size in `sizeList`, logging a warning after
// each size is done. Stops at, and returns, the first error.
Result<Void> ChunkFileStore::addChunkSize(const std::vector<Size> &sizeList) {
  for (auto it = sizeList.begin(); it != sizeList.end(); ++it) {
    auto &chunkSize = *it;
    RETURN_AND_LOG_ON_ERROR(createInnerFile(chunkSize));
    XLOGF(WARNING, "chunk inner files are created, path {}, size {}", path_, chunkSize);
  }
  return Void{};
}
|
||||
|
||||
// Opens (or fetches from the cache) the inner file `fileId` and returns a view that
// carries copies of its two file descriptors and its index.
Result<ChunkFileView> ChunkFileStore::open(ChunkFileId fileId) {
  auto opened = openInnerFile(fileId);
  if (UNLIKELY(!opened)) {
    XLOGF(ERR, "open file {} failed: {}", fileId, opened.error());
    return makeError(std::move(opened.error()));
  }

  auto &descriptor = **opened;
  ChunkFileView view;
  view.normal_ = descriptor.normal_;
  view.direct_ = descriptor.direct_;
  view.index_ = descriptor.index_;
  return view;
}
|
||||
|
||||
// Releases the storage behind one chunk slot by punching a hole of `fileId.chunkSize`
// bytes at `offset` in the inner file, keeping the file size unchanged.
// Returns the open error if the inner file cannot be opened, or kPunchHoleFailed
// (with the errno reported by fallocate) if punching the hole fails.
Result<Void> ChunkFileStore::punchHole(ChunkFileId fileId, size_t offset) {
  auto recordGuard = punchHoleRecorder.record();

  auto openResult = openInnerFile(fileId);
  if (UNLIKELY(!openResult)) {
    XLOGF(ERR, "open file {} failed: {}", fileId, openResult.error());
    return makeError(std::move(openResult.error()));
  }
  auto &innerFile = **openResult;

  int ret = ::fallocate(innerFile.direct_, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, fileId.chunkSize);
  // Capture errno before any logging: the log calls below may themselves make
  // library/system calls that overwrite it.
  const int savedErrno = errno;
  XLOGF(DBG, "punch hole {}, offset {}", fileId, Size::toString(offset));
  if (UNLIKELY(ret == -1)) {
    // Format once so the logged message and the returned error always agree.
    auto msg = fmt::format("punch hole to {} failed: {}", fileId, savedErrno);
    XLOG(ERR, msg);
    return makeError(StorageCode::kPunchHoleFailed, std::move(msg));
  }

  recordGuard.succ();
  return Void{};
}
|
||||
|
||||
// Pre-allocates `size` bytes at `offset` in the inner file `fileId` via fallocate().
// Returns the open error if the inner file cannot be opened, or kPunchHoleFailed
// (the code this function has always used, with the errno reported by fallocate)
// if the allocation fails.
Result<Void> ChunkFileStore::allocate(ChunkFileId fileId, size_t offset, size_t size) {
  auto recordGuard = allocateSpaceRecorder.record();

  auto openResult = openInnerFile(fileId);
  RETURN_AND_LOG_ON_ERROR(openResult);
  auto &innerFile = **openResult;

  int ret = ::fallocate(innerFile.direct_, 0, offset, size);
  // Capture errno before any logging: the debug log below may itself make
  // library/system calls that overwrite it.
  const int savedErrno = errno;
  XLOGF(DBG, "allocate {}, offset {}, size {}", fileId, Size::toString(offset), Size::toString(size));
  if (UNLIKELY(ret == -1)) {
    auto msg = fmt::format("allocate to {} failed: {}", fileId, savedErrno);
    XLOG(ERR, msg);
    return makeError(StorageCode::kPunchHoleFailed, std::move(msg));
  }

  recordGuard.succ();
  return Void{};
}
|
||||
|
||||
// Returns the descriptor of the inner file `fileId`. The per-thread cache is consulted
// first; on a miss the file at <path>/<chunk size>/<chunk index as 2-digit hex> is
// opened through the global file store (creating it when `createFile` is set) and the
// result is remembered in the per-thread cache.
Result<FileDescriptor *> ChunkFileStore::openInnerFile(ChunkFileId fileId, bool createFile /* = false */) {
  // operator[] leaves a null entry behind on a miss; it is filled in below on success.
  auto &slot = (*tlsCache_)[fileId];
  if (UNLIKELY(slot == nullptr)) {
    Path filePath = path_ / Size::toString(fileId.chunkSize) / fmt::format("{:02X}", fileId.chunkIdx);
    auto openResult = globalFileStore_.open(filePath, createFile);
    RETURN_AND_LOG_ON_ERROR(openResult);
    slot = *openResult;
    return openResult;
  }
  return slot;
}
|
||||
|
||||
// Creates the directory and all physical files for one chunk size.
// The size must be at least kAIOAlignSize, a multiple of kAIOAlignSize, and at most
// kMaxChunkSize; otherwise kChunkStoreInitFailed is returned. A failure to create the
// directory yields kChunkOpenFailed; a failure to open/create a file is returned as is.
Result<Void> ChunkFileStore::createInnerFile(Size chunkSize) {
  // Shared tail of the three validation failures: log the message and wrap it.
  auto reject = [](std::string msg) -> Result<Void> {
    XLOG(ERR, msg);
    return makeError(StorageCode::kChunkStoreInitFailed, std::move(msg));
  };

  if (chunkSize < kAIOAlignSize) {
    return reject(fmt::format("chunk size too small: {}", chunkSize));
  }
  if (chunkSize % kAIOAlignSize) {
    return reject(fmt::format("chunk size not aligned: {}", chunkSize));
  }
  if (chunkSize > kMaxChunkSize) {
    return reject(fmt::format("chunk size too large: {}", chunkSize));
  }

  auto dirPath = path_ / Size::toString(chunkSize);
  boost::system::error_code ec{};
  boost::filesystem::create_directories(dirPath, ec);
  if (UNLIKELY(ec.failed())) {
    auto msg = fmt::format("chunk store create directory {} failed: {}", dirPath.string(), ec.message());
    XLOG(ERR, msg);
    return makeError(StorageCode::kChunkOpenFailed, std::move(msg));
  }

  for (auto fileIdx = 0u; fileIdx < physicalFileCount_; ++fileIdx) {
    ChunkFileId id;
    id.chunkSize = chunkSize;
    id.chunkIdx = fileIdx;
    RETURN_AND_LOG_ON_ERROR(openInnerFile(id, true));
  }
  return Void{};
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
65
src/storage/store/ChunkFileStore.h
Normal file
65
src/storage/store/ChunkFileStore.h
Normal file
@@ -0,0 +1,65 @@
|
||||
#pragma once
|
||||
|
||||
#include <folly/ThreadLocal.h>
|
||||
#include <mutex>
|
||||
|
||||
#include "common/utils/ConfigBase.h"
|
||||
#include "common/utils/FdWrapper.h"
|
||||
#include "common/utils/Path.h"
|
||||
#include "common/utils/Result.h"
|
||||
#include "common/utils/RobinHood.h"
|
||||
#include "common/utils/Shards.h"
|
||||
#include "storage/store/ChunkFileView.h"
|
||||
#include "storage/store/ChunkMetadata.h"
|
||||
#include "storage/store/GlobalFileStore.h"
|
||||
#include "storage/store/PhysicalConfig.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
class ChunkFileStore {
|
||||
public:
|
||||
class Config : public ConfigBase<Config> {
|
||||
CONFIG_ITEM(preopen_chunk_size_list, std::set<Size>{});
|
||||
};
|
||||
|
||||
ChunkFileStore(const Config &config, GlobalFileStore &globalFileStore)
|
||||
: config_(config),
|
||||
globalFileStore_(globalFileStore) {}
|
||||
|
||||
// create file store.
|
||||
Result<Void> create(const PhysicalConfig &config);
|
||||
|
||||
// load file store.
|
||||
Result<Void> load(const PhysicalConfig &config);
|
||||
|
||||
// add new chunk size.
|
||||
Result<Void> addChunkSize(const std::vector<Size> &sizeList);
|
||||
|
||||
// get a chunk file. [thread-safe]
|
||||
Result<ChunkFileView> open(ChunkFileId fileId);
|
||||
|
||||
// recycle a chunk. [thread-safe]
|
||||
Result<Void> punchHole(ChunkFileId fileId, size_t offset);
|
||||
|
||||
// allocate space. [thread-safe]
|
||||
Result<Void> allocate(ChunkFileId fileId, size_t offset, size_t size);
|
||||
|
||||
protected:
|
||||
// open inner file. [thread-safe]
|
||||
Result<FileDescriptor *> openInnerFile(ChunkFileId fileId, bool createFile = false);
|
||||
|
||||
// create inner file.
|
||||
Result<Void> createInnerFile(Size chunkSize);
|
||||
|
||||
private:
|
||||
const Config &config_;
|
||||
GlobalFileStore &globalFileStore_;
|
||||
|
||||
Path path_;
|
||||
uint32_t physicalFileCount_{};
|
||||
|
||||
constexpr static auto kShardsNum = 64u;
|
||||
folly::ThreadLocal<robin_hood::unordered_map<ChunkFileId, FileDescriptor *>> tlsCache_;
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
127
src/storage/store/ChunkFileView.cc
Normal file
127
src/storage/store/ChunkFileView.cc
Normal file
@@ -0,0 +1,127 @@
|
||||
#include "storage/store/ChunkFileView.h"
|
||||
|
||||
#include <folly/ScopeGuard.h>
|
||||
#include <folly/logging/xlog.h>
|
||||
#include <thread>
|
||||
#include <utility>
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/utils/Duration.h"
|
||||
#include "common/utils/ExponentialBackoffRetry.h"
|
||||
#include "storage/store/ChunkMetadata.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
// Monitoring recorders used by ChunkFileView::read() and ChunkFileView::write().
// NOTE(review): these are not in an anonymous namespace, so they have external linkage — confirm that is intended.
// Records ChunkFileView::read() calls.
monitor::OperationRecorder storageReadRecord{"storage.pread"};
|
||||
// Records ChunkFileView::write() calls.
monitor::OperationRecorder storageWriteRecord{"storage.pwrite"};
|
||||
// Distribution of the requested write sizes.
monitor::DistributionRecorder storageWriteSize{"storage.pwrite.size"};
|
||||
// Counts writes that went through the direct fd.
monitor::CountRecorder storageWriteDirect{"storage.pwrite.direct"};
|
||||
|
||||
// Reads up to `size` bytes starting at file offset `offset` into `buf`, looping over
// short reads. Uses the direct fd when `direct` is set, the normal fd otherwise.
// Returns the number of bytes actually read (less than `size` if end of file is hit),
// or kChunkReadFailed if pread() reports an error.
Result<uint32_t> ChunkFileView::read(uint8_t *buf, size_t size, size_t offset, bool direct /* = false */) const {
  auto recordGuard = storageReadRecord.record();

  const int fd = direct ? direct_ : normal_;
  uint32_t total = 0;
  for (; size > 0;) {
    int n = ::pread(fd, buf, size, offset);
    if (UNLIKELY(n < 0)) {
      auto msg = fmt::format("read chunk file failed: fd {}, offset {}, errno {}", fd, offset, errno);
      XLOG(ERR, msg);
      return makeError(StorageCode::kChunkReadFailed, std::move(msg));
    }
    if (n == 0) {
      break;  // end of file.
    }
    total += n;
    buf += n;
    size -= n;
    offset += n;
  }

  recordGuard.succ();
  return total;
}
|
||||
|
||||
// Writes `size` bytes from `buf` at chunk-relative `offset`; the chunk's position inside
// the inner file (`meta.innerOffset`) is added before writing.
// The direct fd is used when size, file offset and buffer address are all multiples of
// kAIOAlignSize; otherwise the normal fd is used.
// A non-positive pwrite() result is logged and retried with exponential backoff until
// the retry helper returns a zero wait time, at which point kChunkWriteFailed is returned.
// Returns kInvalidArg if the write would extend past the chunk size.
Result<uint32_t> ChunkFileView::write(const uint8_t *buf, size_t size, size_t offset, const ChunkMetadata &meta) {
  auto recordGuard = storageWriteRecord.record();
  storageWriteSize.addSample(size);

  if (UNLIKELY(size + offset > meta.innerFileId.chunkSize)) {
    auto msg = fmt::format("chunk write exceed chunk size, meta {}, size {}, offset {}", meta, size, offset);
    reportFatalEvent();
    XLOG(DFATAL, msg);
    return makeError(StatusCode::kInvalidArg, std::move(msg));
  }

  // From here on `offset` is an offset into the inner file.
  offset += meta.innerOffset;

  const bool aligned = size % kAIOAlignSize == 0 && offset % kAIOAlignSize == 0 &&
                       reinterpret_cast<uint64_t>(buf) % kAIOAlignSize == 0;
  int fd = normal_;
  if (aligned) {
    fd = direct_;
    storageWriteDirect.addSample(1);
    XLOGF(DBG, "use direct fd for write: fd {}, size {}, offset {}", fd, size, offset);
  }

  uint32_t written = 0;
  ExponentialBackoffRetry retry(100_ms, 5_s, 30_s);
  while (size > 0) {
    int ret = ::pwrite(fd, buf, size, offset);
    if (LIKELY(ret > 0)) {
      written += ret;
      buf += ret;
      size -= ret;
      offset += ret;
      continue;
    }

    // Failure (or zero-length write): log, then either back off and retry or give up.
    auto msg = fmt::format("write chunk file failed: fd {}, direct {}, buf {}, offset {}, size {}, ret {}, errno {}",
                           fd,
                           fd == direct_,
                           fmt::ptr(buf),
                           offset,
                           size,
                           ret,
                           errno);
    XLOG(ERR, msg);
    auto waitTime = retry.getWaitTime();
    if (waitTime.count() == 0) {
      return makeError(StorageCode::kChunkWriteFailed, std::move(msg));
    }
    std::this_thread::sleep_for(waitTime);
  }

  recordGuard.succ();
  return written;
}
|
||||
|
||||
// Computes the checksum of `size` bytes starting at chunk-relative `offset` by reading
// the data back from the file; the chunk's position inside the inner file
// (`meta.innerOffset`) is added before reading.
// Returns kInvalidArg if the range extends past the chunk size, or kChunkReadFailed if
// ChecksumInfo::create() yields a checksum of type NONE.
Result<ChecksumInfo> ChunkFileView::checksum(ChecksumType type, size_t size, size_t offset, const ChunkMetadata &meta) {
  if (UNLIKELY(size + offset > meta.innerFileId.chunkSize)) {
    // Name the operation that actually failed (this text used to say "write").
    auto msg = fmt::format("chunk checksum exceed chunk size, meta {}, size {}, offset {}", meta, size, offset);
    reportFatalEvent();
    XLOG(DFATAL, msg);
    return makeError(StatusCode::kInvalidArg, std::move(msg));
  }

  // From here on `offset` is an offset into the inner file.
  offset += meta.innerOffset;
  ChunkDataIterator iter(*this, size, offset);
  auto checksum = ChecksumInfo::create(type, &iter, size);
  if (checksum.type == ChecksumType::NONE) return makeError(StorageCode::kChunkReadFailed);
  return checksum;
}
|
||||
|
||||
std::pair<const uint8_t *, size_t> ChunkDataIterator::next() {
|
||||
if (length_ == 0) return {nullptr, 0};
|
||||
|
||||
size_t readSize = std::min(length_, ChecksumInfo::kChunkSize);
|
||||
bool directIO = readSize % kAIOAlignSize == 0 && offset_ % kAIOAlignSize == 0;
|
||||
auto readRes = chunkFile_.read(data_, readSize, offset_, directIO);
|
||||
|
||||
if (!readRes) {
|
||||
XLOGF(ERR, "Cannot calculate checksum since read failed, error: {}", readRes);
|
||||
return {nullptr, 0};
|
||||
} else if (*readRes != readSize) {
|
||||
XLOGF(ERR, "Cannot calculate checksum since read size {} not equal to requested size {}", *readRes, readSize);
|
||||
return {nullptr, 0};
|
||||
}
|
||||
|
||||
offset_ += readSize;
|
||||
length_ -= readSize;
|
||||
|
||||
return {data_, readSize};
|
||||
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
53
src/storage/store/ChunkFileView.h
Normal file
53
src/storage/store/ChunkFileView.h
Normal file
@@ -0,0 +1,53 @@
|
||||
#pragma once
|
||||
|
||||
#include <folly/Range.h>
|
||||
|
||||
#include "common/utils/Result.h"
|
||||
#include "fbs/storage/Common.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
class ChunkFileView {
|
||||
public:
|
||||
// read a piece of data.
|
||||
Result<uint32_t> read(uint8_t *buf, size_t size, size_t offset, bool direct = false) const;
|
||||
|
||||
// write a piece of data.
|
||||
Result<uint32_t> write(const uint8_t *buf, size_t size, size_t offset, const ChunkMetadata &meta);
|
||||
|
||||
// calculate the chunk checksum
|
||||
Result<ChecksumInfo> checksum(ChecksumType type, size_t size, size_t offset, const ChunkMetadata &meta);
|
||||
|
||||
// get direct fd for aio read.
|
||||
int directFD() const { return direct_; }
|
||||
|
||||
// get fd index in list.
|
||||
auto &index() const { return index_; }
|
||||
|
||||
private:
|
||||
friend class ChunkFileStore;
|
||||
int normal_;
|
||||
int direct_;
|
||||
std::optional<uint32_t> index_{};
|
||||
};
|
||||
|
||||
class ChunkDataIterator : public ChecksumInfo::DataIterator {
|
||||
public:
|
||||
ChunkDataIterator(ChunkFileView &chunkFile, size_t length, size_t offset)
|
||||
: data_((uint8_t *)memory::memalign(kAIOAlignSize, ChecksumInfo::kChunkSize)),
|
||||
chunkFile_(chunkFile),
|
||||
length_(length),
|
||||
offset_(offset) {}
|
||||
|
||||
~ChunkDataIterator() override { memory::deallocate(data_); }
|
||||
|
||||
std::pair<const uint8_t *, size_t> next() override;
|
||||
|
||||
private:
|
||||
uint8_t *data_;
|
||||
ChunkFileView &chunkFile_;
|
||||
size_t length_;
|
||||
size_t offset_;
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
1055
src/storage/store/ChunkMetaStore.cc
Normal file
1055
src/storage/store/ChunkMetaStore.cc
Normal file
File diff suppressed because it is too large
Load Diff
161
src/storage/store/ChunkMetaStore.h
Normal file
161
src/storage/store/ChunkMetaStore.h
Normal file
@@ -0,0 +1,161 @@
|
||||
#pragma once
|
||||
|
||||
#include <folly/AtomicUnorderedMap.h>
|
||||
#include <memory>
|
||||
#include <queue>
|
||||
|
||||
#include "common/utils/ConfigBase.h"
|
||||
#include "common/utils/Path.h"
|
||||
#include "common/utils/UtcTime.h"
|
||||
#include "kv/KVStore.h"
|
||||
#include "storage/store/ChunkFileStore.h"
|
||||
#include "storage/store/ChunkMetadata.h"
|
||||
#include "storage/store/PhysicalConfig.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
class ChunkMetaStore {
|
||||
public:
|
||||
class Config : public ConfigBase<Config> {
|
||||
CONFIG_HOT_UPDATED_ITEM(allocate_size, 256_MB, [](Size s) { return s && s % kMaxChunkSize == 0; });
|
||||
CONFIG_HOT_UPDATED_ITEM(recycle_batch_size, 256u, ConfigCheckers::checkPositive);
|
||||
CONFIG_HOT_UPDATED_ITEM(punch_hole_batch_size, 16u, ConfigCheckers::checkPositive);
|
||||
CONFIG_HOT_UPDATED_ITEM(removed_chunk_expiration_time, 3_d);
|
||||
CONFIG_HOT_UPDATED_ITEM(removed_chunk_force_recycled_time, 1_h);
|
||||
};
|
||||
|
||||
ChunkMetaStore(const Config &config, ChunkFileStore &fileStore)
|
||||
: config_(config),
|
||||
fileStore_(fileStore),
|
||||
allocateState_(16) {}
|
||||
|
||||
~ChunkMetaStore();
|
||||
|
||||
// create chunk meta store.
|
||||
Result<Void> create(const kv::KVStore::Config &config, const PhysicalConfig &targetConfig);
|
||||
|
||||
// load chunk meta store.
|
||||
Result<Void> load(const kv::KVStore::Config &config,
|
||||
const PhysicalConfig &targetConfig,
|
||||
bool createIfMissing = false);
|
||||
|
||||
// add new chunk size.
|
||||
Result<Void> addChunkSize(const std::vector<Size> &sizeList);
|
||||
|
||||
// migrate chunk meta store.
|
||||
Result<Void> migrate(const kv::KVStore::Config &config, const PhysicalConfig &targetConfig);
|
||||
|
||||
// get metadata of chunk. [thread-safe]
|
||||
Result<Void> get(const ChunkId &chunkId, ChunkMetadata &meta);
|
||||
|
||||
// set metadata of chunk. [thread-safe]
|
||||
Result<Void> set(const ChunkId &chunkId, const ChunkMetadata &meta);
|
||||
|
||||
// remove metadata of chunk. [thread-safe]
|
||||
Result<Void> remove(const ChunkId &chunkId, const ChunkMetadata &meta);
|
||||
|
||||
// create a chunk. [thread-safe]
|
||||
Result<Void> createChunk(const ChunkId &chunkId,
|
||||
ChunkMetadata &meta,
|
||||
uint32_t chunkSize,
|
||||
folly::CPUThreadPoolExecutor &executor,
|
||||
bool allowToAllocate);
|
||||
|
||||
// recycle a batch of chunks, return true if has more. [thread-safe]
|
||||
Result<bool> punchHole();
|
||||
|
||||
// sync the LOG of kv.
|
||||
Result<Void> sync();
|
||||
|
||||
// get used size.
|
||||
uint64_t usedSize() const { return std::max(int64_t(createdSize_.load() - removedSize_.load()), 0l); }
|
||||
|
||||
// get reserved and unrecycled size.
|
||||
Result<Void> unusedSize(int64_t &reservedSize, int64_t &unrecycledSize);
|
||||
|
||||
// get all uncommitted chunk ids.
|
||||
auto &uncommitted() { return uncommitted_; }
|
||||
|
||||
// enable or disable emergency recycling.
|
||||
void setEmergencyRecycling(bool enable) { emergencyRecycling_ = enable; }
|
||||
|
||||
// iterator.
|
||||
class Iterator {
|
||||
public:
|
||||
explicit Iterator(kv::KVStore::IteratorPtr it, std::string_view chunkIdPrefix);
|
||||
// seek a chunk id prefix.
|
||||
void seek(std::string_view chunkIdPrefix);
|
||||
// return valid or not.
|
||||
bool valid() const;
|
||||
// get current chunk id.
|
||||
ChunkId chunkId() const;
|
||||
// get current metadata.
|
||||
Result<ChunkMetadata> meta() const;
|
||||
// next metadata.
|
||||
void next();
|
||||
// check status.
|
||||
Result<Void> status() const;
|
||||
|
||||
private:
|
||||
kv::KVStore::IteratorPtr it_;
|
||||
};
|
||||
Result<Iterator> iterator(std::string_view chunkIdPrefix = {});
|
||||
|
||||
protected:
|
||||
Result<Void> checkSentinel(std::string_view key);
|
||||
|
||||
Result<Void> getSize(std::string_view key, std::atomic<uint64_t> &size);
|
||||
|
||||
struct AllocateState {
|
||||
std::mutex createMutex;
|
||||
std::mutex recycleMutex;
|
||||
std::mutex allocateMutex;
|
||||
std::atomic<bool> loaded{};
|
||||
std::atomic<bool> allocating{};
|
||||
std::atomic<bool> recycling{};
|
||||
uint32_t chunkSize{};
|
||||
uint32_t allocateIndex{}; // createMutex.
|
||||
std::atomic<uint64_t> startingPoint{}; // createMutex.
|
||||
std::atomic<uint64_t> createdCount{}; // createMutex.
|
||||
std::atomic<uint64_t> usedCount{}; // createMutex.
|
||||
std::atomic<uint64_t> removedCount{};
|
||||
std::atomic<uint64_t> recycledCount{}; // recycleMutex
|
||||
std::atomic<uint64_t> reusedCount{}; // createMutex
|
||||
std::atomic<uint64_t> holeCount{}; // recycleMutex
|
||||
std::atomic<UtcTime> oldestRemovedTimestamp{}; // recycleMutex
|
||||
std::vector<ChunkPosition> createdChunks; // createMutex
|
||||
std::vector<ChunkPosition> recycledChunks; // createMutex
|
||||
robin_hood::unordered_map<uint32_t, size_t> fileSize; // createMutex
|
||||
};
|
||||
void createAllocateState(uint32_t chunkSize);
|
||||
|
||||
Result<AllocateState *> loadAllocateState(uint32_t chunkSize);
|
||||
|
||||
Result<Void> allocateChunks(AllocateState &state, bool withLock = false);
|
||||
|
||||
bool needRecycleRemovedChunks(AllocateState &state);
|
||||
|
||||
Result<Void> recycleRemovedChunks(AllocateState &state, bool withLock = false);
|
||||
|
||||
Result<bool> punchHoleRemovedChunks(AllocateState &state, uint64_t expirationUs);
|
||||
|
||||
private:
|
||||
const Config &config_;
|
||||
ChunkFileStore &fileStore_;
|
||||
|
||||
std::unique_ptr<kv::KVStore> kv_;
|
||||
std::string sentinel_;
|
||||
std::string kvName_;
|
||||
bool hasSentinel_ = false;
|
||||
uint32_t physicalFileCount_ = 256;
|
||||
|
||||
std::atomic<uint64_t> createdSize_ = 0;
|
||||
std::atomic<uint64_t> removedSize_ = 0;
|
||||
std::vector<ChunkId> uncommitted_;
|
||||
|
||||
std::atomic<bool> emergencyRecycling_ = false;
|
||||
|
||||
folly::AtomicUnorderedInsertMap<uint32_t, std::unique_ptr<AllocateState>> allocateState_;
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
14
src/storage/store/ChunkMetadata.cc
Normal file
14
src/storage/store/ChunkMetadata.cc
Normal file
@@ -0,0 +1,14 @@
|
||||
#include "ChunkMetadata.h"
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
// File-local counter behind reportFatalEvent().
namespace {
|
||||
|
||||
// Incremented by one on every reportFatalEvent() call.
monitor::CountRecorder fatalEvent{"storage.fatal"};
|
||||
|
||||
} // namespace
|
||||
|
||||
// Adds one sample to the "storage.fatal" counter.
void reportFatalEvent() {
  fatalEvent.addSample(1);
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
43
src/storage/store/ChunkMetadata.h
Normal file
43
src/storage/store/ChunkMetadata.h
Normal file
@@ -0,0 +1,43 @@
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <bit>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <folly/Hash.h>
|
||||
#include <folly/lang/Bits.h>
|
||||
#include <string>
|
||||
|
||||
#include "common/serde/BigEndian.h"
|
||||
#include "common/serde/Serde.h"
|
||||
#include "common/utils/Int128.h"
|
||||
#include "common/utils/Result.h"
|
||||
#include "common/utils/Size.h"
|
||||
#include "common/utils/StrongType.h"
|
||||
#include "fbs/storage/Common.h"
|
||||
#include "storage/store/ChunkFileView.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
// Upper bound on the chunk size accepted by the storage code.
inline constexpr auto kMaxChunkSize = 64_MB;
|
||||
|
||||
// Bundles a chunk's metadata with a view of a chunk file.
// NOTE(review): presumably `view` is the file that stores the chunk — confirm where `view` is assigned.
struct ChunkInfo {
|
||||
ChunkMetadata meta;
|
||||
ChunkFileView view;
|
||||
};
|
||||
|
||||
// Serializable pair of a file index and an offset.
// NOTE(review): presumably identifies a chunk slot inside an inner file — confirm against ChunkMetaStore.cc.
struct ChunkPosition {
|
||||
SERDE_STRUCT_FIELD(fileIdx, uint32_t{});
|
||||
// The offset is declared as a serde::BigEndian value.
SERDE_STRUCT_FIELD(offset, serde::BigEndian<std::size_t>{});
|
||||
};
|
||||
|
||||
// Records one fatal event in monitoring; callers invoke it right before logging a DFATAL message.
void reportFatalEvent();
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
|
||||
template <>
|
||||
struct ::std::hash<hf3fs::storage::ChunkFileId> {
|
||||
size_t operator()(hf3fs::storage::ChunkFileId id) const {
|
||||
return folly::hash::twang_mix64(reinterpret_cast<uint64_t &>(id));
|
||||
}
|
||||
};
|
||||
469
src/storage/store/ChunkReplica.cc
Normal file
469
src/storage/store/ChunkReplica.cc
Normal file
@@ -0,0 +1,469 @@
|
||||
#include "storage/store/ChunkReplica.h"
|
||||
|
||||
#include <folly/Random.h>
|
||||
#include <folly/logging/xlog.h>
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/utils/Duration.h"
|
||||
#include "common/utils/FileUtils.h"
|
||||
#include "common/utils/Result.h"
|
||||
#include "common/utils/SysResource.h"
|
||||
#include "fbs/storage/Common.h"
|
||||
#include "scn/scan/scan.h"
|
||||
#include "storage/store/ChunkMetadata.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
// File-local state for ChunkReplica: a zero-filled buffer and monitoring recorders.
namespace {
|
||||
|
||||
// A value-initialized (all-zero) buffer of kMaxChunkSize bytes.
std::array<uint8_t, kMaxChunkSize> kZeroBytes{};
|
||||
|
||||
// Read path: operation recorder for aioPrepareRead() and a counter for reads rejected on a version check.
monitor::OperationRecorder storageAioReadRecorder{"storage.chunk_read"};
|
||||
monitor::CountRecorder storageReadUncommitted{"storage.chunk_read.uncommitted"};
|
||||
|
||||
// Update path: operation recorder for update() plus counters named after the cases they count.
monitor::OperationRecorder storageUpdateRecorder{"storage.chunk_update"};
|
||||
monitor::CountRecorder storageUpdateSyncSmallerVersion{"storage.chunk_update.sync_smaller_version"};
|
||||
monitor::CountRecorder storageUpdateChecksumReadChunk{"storage.chunk_update.checksum_read_chunk"};
|
||||
monitor::CountRecorder storageUpdateChecksumCombine{"storage.chunk_update.checksum_combine"};
|
||||
monitor::CountRecorder storageUpdateChecksumReuse{"storage.chunk_update.checksum_reuse"};
|
||||
monitor::CountRecorder storageUpdateChecksumNone{"storage.chunk_update.checksum_none"};
|
||||
monitor::CountRecorder storageUpdateSeqWrite{"storage.chunk_update.seq_write"};
|
||||
|
||||
// Commit path recorders.
monitor::OperationRecorder storageCommitRecorder{"storage.chunk_commit"};
|
||||
monitor::CountRecorder storageCommitDirty{"storage.chunk_commit.dirty"};
|
||||
monitor::CountRecorder storageCommitStale{"storage.chunk_commit.stale"};
|
||||
|
||||
} // namespace
|
||||
|
||||
// prepare aio read.
|
||||
Result<Void> ChunkReplica::aioPrepareRead(ChunkStore &store, AioReadJob &job) {
|
||||
auto recordGuard = storageAioReadRecorder.record();
|
||||
|
||||
auto &result = job.result();
|
||||
auto &state = job.state();
|
||||
const auto &chunkId = job.readIO().key.chunkId;
|
||||
|
||||
// 1. get meta info.
|
||||
auto metaResult = store.get(chunkId);
|
||||
if (UNLIKELY(metaResult.hasError())) {
|
||||
XLOGF(INFO, "{}", metaResult.error());
|
||||
RETURN_ERROR(metaResult);
|
||||
}
|
||||
auto &chunkInfo = (*metaResult)->second;
|
||||
const ChunkMetadata &meta = chunkInfo.meta;
|
||||
|
||||
// 2. check meta info.
|
||||
result.commitVer = meta.commitVer;
|
||||
result.updateVer = meta.updateVer;
|
||||
result.commitChainVer = meta.chainVer;
|
||||
state.chunkLen = meta.size;
|
||||
state.chunkChecksum = meta.checksum();
|
||||
|
||||
if (UNLIKELY(result.commitVer != result.updateVer && !state.readUncommitted)) {
|
||||
auto msg = fmt::format("chunk {} {} version mismatch {} != {}", chunkId, meta, result.commitVer, result.updateVer);
|
||||
XLOG(ERR, msg);
|
||||
storageReadUncommitted.addSample(1);
|
||||
return makeError(StorageCode::kChunkNotCommit, std::move(msg));
|
||||
}
|
||||
|
||||
// 3. prepare aio read.
|
||||
state.readLength = job.alignedLength();
|
||||
state.readFd = chunkInfo.view.directFD();
|
||||
state.fdIndex = chunkInfo.view.index();
|
||||
state.readOffset = meta.innerOffset + job.alignedOffset();
|
||||
|
||||
recordGuard.succ();
|
||||
return Void{};
|
||||
}
|
||||
|
||||
// finish aio read.
|
||||
Result<Void> ChunkReplica::aioFinishRead(ChunkStore &store, AioReadJob &job) {
|
||||
auto &readIO = job.readIO();
|
||||
auto &result = job.result();
|
||||
const auto &chunkId = readIO.key.chunkId;
|
||||
|
||||
// 1. get meta info.
|
||||
auto metaResult = store.get(chunkId);
|
||||
if (UNLIKELY(!metaResult)) {
|
||||
return makeError(std::move(metaResult.error()));
|
||||
}
|
||||
auto &chunkInfo = (*metaResult)->second;
|
||||
const ChunkMetadata &meta = chunkInfo.meta;
|
||||
|
||||
// 2. check meta info.
|
||||
if (UNLIKELY(result.updateVer != meta.updateVer)) {
|
||||
auto msg = fmt::format("chunk {} {} version outdated {} != {}", chunkId, meta, result.updateVer, meta.updateVer);
|
||||
XLOG(ERR, msg);
|
||||
storageReadUncommitted.addSample(1);
|
||||
return makeError(StorageCode::kChunkNotCommit, std::move(msg));
|
||||
}
|
||||
return Void{};
|
||||
}
|
||||
|
||||
// Writes `writeSize` bytes of `writeData` at `writeOffset` into the chunk's file and,
// on success, grows `chunkInfo.meta.size` to cover the written range.
// Returns the result of the underlying write unchanged.
// In builds without NDEBUG, a percentage read from
// /tmp/storage_main_write_failed.<pid> makes that share of calls fail artificially.
static Result<uint32_t> doRealWrite(const ChunkId &chunkId,
                                    ChunkInfo &chunkInfo,
                                    const uint8_t *writeData,
                                    uint32_t writeSize,
                                    uint32_t writeOffset) {
#ifndef NDEBUG
  // For debug and unittest.
  static auto flagPath = Path{fmt::format("/tmp/storage_main_write_failed.{}", SysResource::pid())};
  auto flagContent = loadFile(flagPath);
  uint32_t failPercent = 0;
  if (flagContent && scn::scan(*flagContent, "{}", failPercent)) {
    if (folly::Random::rand32(100) < failPercent) {
      auto msg = fmt::format("chunk replica write error for unittest");
      XLOG(ERR, msg);
      return makeError(StorageCode::kChunkWriteFailed, std::move(msg));
    }
  }
#endif

  ChunkMetadata &meta = chunkInfo.meta;
  auto writeResult = chunkInfo.view.write(writeData, writeSize, writeOffset, meta);
  if (UNLIKELY(!bool(writeResult))) {
    XLOGF(ERR, "chunk replica {} {} write error {}", chunkId, meta, writeResult.error());
    return writeResult;
  }

  // The chunk never shrinks here: keep the larger of the old size and the new end.
  meta.size = std::max(uint32_t(meta.size), writeOffset + writeResult.value());
  return writeResult;
}
|
||||
|
||||
// do write.
|
||||
// Apply the update described by `job.updateIO()` (write / truncate / extend / remove) to one chunk.
//
// Steps, as implemented below:
//   0. reject non-remove requests whose [offset, offset + length) range does not fit in the chunk;
//   1. load the chunk info from `store`, or prepare a fresh one when the chunk does not exist yet
//      (removing a missing chunk succeeds immediately);
//   2. validate chunk size, state, chain version and checksum, advance the update version and
//      record the chunk as DIRTY in the store (creating it if needed);
//   3. perform the data operation;
//   4. recompute the chunk checksum, mark the chunk CLEAN and store the metadata again.
//
// `job.result()` receives the versions / checksum observed or produced along the way, also on
// several error paths. On success the return value is what step 3 produced: the result of the
// chunk write for a normal write, the chunk length for truncate/extend, and 0 for remove.
Result<uint32_t> ChunkReplica::update(ChunkStore &store, UpdateJob &job, folly::CPUThreadPoolExecutor &executor) {
  auto recordGuard = storageUpdateRecorder.record();

  const auto &writeIO = job.updateIO();
  const auto &options = job.options();
  const auto &chunkId = writeIO.key.chunkId;
  const auto &state = job.state();
  auto &result = job.result();

  // `offset < chunkSize` is established before the length comparison runs, so the subtraction cannot
  // wrap. Comparing `offset + length` against the chunk size could wrap around for fixed-width
  // unsigned fields and let an oversized request through.
  if (UNLIKELY(!writeIO.isRemove() &&
               (writeIO.offset >= writeIO.chunkSize || writeIO.length > writeIO.chunkSize - writeIO.offset))) {
    auto msg = fmt::format("chunk {} write offset exceed chunk size {}", chunkId, writeIO);
    XLOG(ERR, msg);
    return makeError(StatusCode::kInvalidArg, std::move(msg));
  }

  // 1. get meta info.
  ChunkInfo chunkInfo;
  bool needCreateChunk = false;
  auto metaResult = store.get(chunkId);
  if (metaResult) {
    // Work on a copy; the store is only updated through createChunk()/set() below.
    chunkInfo = (*metaResult)->second;
  } else if (metaResult.error().code() == StorageCode::kChunkMetadataNotFound) {
    if (writeIO.isRemove()) {
      // Removing a chunk that does not exist is a successful no-op.
      result.commitVer = result.updateVer = writeIO.updateVer;
      result.commitChainVer = job.commitChainVer();
      return 0;
    } else {
      needCreateChunk = true;
      chunkInfo.meta.chainVer = job.commitChainVer();
      chunkInfo.meta.chunkState = ChunkState::CLEAN;
      chunkInfo.meta.innerFileId.chunkSize = writeIO.chunkSize;
    }
  } else {
    RETURN_AND_LOG_ON_ERROR(metaResult);
  }
  ChunkMetadata &meta = chunkInfo.meta;

  // 2. begin to write.
  auto chunkSize = writeIO.isRemove() ? meta.innerFileId.chunkSize : writeIO.chunkSize;
  result.commitVer = meta.commitVer;
  result.updateVer = meta.updateVer;
  result.checksum = meta.checksum();
  result.commitChainVer = meta.chainVer;
  if (UNLIKELY(meta.innerFileId.chunkSize != chunkSize)) {
    auto msg = fmt::format("chunk {} {} chunk size mismatch {}", chunkId, meta, chunkSize);
    XLOG(ERR, msg);
    return makeError(StorageCode::kChunkSizeMismatch, std::move(msg));
  }
  if (UNLIKELY(meta.chunkState == ChunkState::DIRTY && !options.isSyncing)) {
    auto msg = fmt::format("chunk {} {} state not valid", chunkId, meta);
    XLOG(ERR, msg);
    // Reuse the message that was just logged instead of formatting it a second time.
    return makeError(StorageCode::kChunkNotClean, std::move(msg));
  }
  if (job.commitChainVer() < meta.chainVer && meta.chunkState == ChunkState::COMMIT) {
    auto msg = fmt::format("chunk {} {} chain version mismatch {} {}", chunkId, meta, writeIO, options);
    reportFatalEvent();
    XLOG(DFATAL, msg);
    return makeError(StorageCode::kChainVersionMismatch, std::move(msg));
  }

  // Verify the payload against the checksum supplied with the request.
  if (writeIO.checksum.type != ChecksumType::NONE && writeIO.length != 0) {
    auto checksum = ChecksumInfo::create(writeIO.checksum.type, state.data, writeIO.length);
    if (checksum != writeIO.checksum) {
      // A mismatch is only reported as fatal when fault injection is not enabled for this request.
      if (!job.requestCtx().debugFlags.faultInjectionEnabled()) {
        reportFatalEvent();
      }
      XLOGF_IF(DFATAL,
               !job.requestCtx().debugFlags.faultInjectionEnabled(),
               "Local checksum {} not equal to checksum {} generated by client, write io: {}",
               checksum,
               writeIO.checksum,
               writeIO);
      return makeError(StorageCode::kChecksumMismatch);
    }
  }

  XLOGF(DBG, "chunk {} {} write begin", chunkId, meta);

  if (options.isSyncing) {
    // Syncing writes impose the version carried by the request.
    XLOGF(DBG9, "chunk {} {} sync write: {}", chunkId, meta, writeIO);
    meta.updateVer = writeIO.updateVer;
    meta.commitVer = ChunkVer{writeIO.updateVer - 1};
    meta.recycleState = RecycleState::NORMAL;
  } else if (writeIO.updateVer > 0) {
    // The request names its version: it must be exactly the next one after the current update version.
    if (writeIO.updateVer <= meta.commitVer) {
      auto msg = fmt::format("chunk {} {} committed update {} <= {}", chunkId, meta, writeIO.updateVer, meta.commitVer);
      XLOG(ERR, msg);
      return makeError(StorageCode::kChunkCommittedUpdate, std::move(msg));
    } else if (writeIO.updateVer <= meta.updateVer) {
      auto msg = fmt::format("chunk {} {} stale update {} <= {}", chunkId, meta, writeIO.updateVer, meta.updateVer);
      XLOG(ERR, msg);
      return makeError(StorageCode::kChunkStaleUpdate, std::move(msg));
    } else if (writeIO.updateVer > meta.updateVer + 1) {
      auto msg =
          fmt::format("chunk {} {} missing update {} > {} + 1", chunkId, meta, writeIO.updateVer, meta.updateVer);
      XLOG(ERR, msg);
      return makeError(StorageCode::kChunkMissingUpdate, std::move(msg));
    }
    meta.updateVer = writeIO.updateVer;
  } else {
    // No version in the request: assign the next one, at most one step ahead of the commit version.
    meta.updateVer += 1;
    if (meta.updateVer > meta.commitVer + 1) {
      auto msg = fmt::format("chunk {} {} advance update {}", chunkId, meta, writeIO);
      XLOG(ERR, msg);
      return makeError(StorageCode::kChunkAdvanceUpdate, std::move(msg));
    }
  }

  meta.chunkState = ChunkState::DIRTY;
  meta.chainVer = job.commitChainVer();
  meta.lastRequestId = job.requestCtx().tag.requestId;
  meta.lastClientUuid = job.requestCtx().tag.clientId.uuid;
  meta.timestamp = UtcClock::now();
  const bool isAppendWrite = writeIO.offset == meta.size;
  // Appending writes, truncates and extends ask the store not to persist the metadata
  // (ChunkStore::set() may still persist it when its force_persist option is on).
  const bool skipPersist = (writeIO.isWrite() && isAppendWrite) || writeIO.isTruncate() || writeIO.isExtend();
  auto setResult = needCreateChunk ? store.createChunk(chunkId, chunkSize, chunkInfo, executor, job.allowToAllocate())
                                   : store.set(chunkId, chunkInfo, !skipPersist);
  if (UNLIKELY(!setResult)) {
    return makeError(std::move(setResult.error()));
  }
  result.commitChainVer = meta.chainVer;
  result.updateVer = meta.updateVer;

  uint32_t chunkSizeBeforeWrite = meta.size;

  // 3. do write operation.
  Result<uint32_t> writeResult = 0;
  if (writeIO.isTruncate() || writeIO.isExtend()) {
    if (writeIO.length <= meta.size) {
      if (writeIO.isTruncate()) {
        // Truncate shrinks the logical size; extend never shrinks.
        writeResult = (meta.size = writeIO.length);
      } else {
        writeResult = meta.size;
      }
    } else {
      // extend the chunk (fill zeros)
      writeResult = doRealWrite(chunkId, chunkInfo, kZeroBytes.data(), writeIO.length - meta.size, meta.size);
      if (writeResult) {
        writeResult = meta.size;  // set result to the actual chunk length if write succeeds
      }
    }
  } else if (writeIO.isRemove()) {
    // remove.
    if (!meta.readyToRemove()) {
      meta.recycleState = RecycleState::REMOVAL_IN_PROGRESS;
    }
    writeResult = 0;
  } else {
    // fill zeros before the write range if there is a gap
    if (meta.size < writeIO.offset) {
      RETURN_AND_LOG_ON_ERROR(chunkInfo.view.write(kZeroBytes.data(), writeIO.offset - meta.size, meta.size, meta));
    }

    // normal write.
    writeResult = doRealWrite(chunkId, chunkInfo, state.data, writeIO.length, writeIO.offset);
    if (writeResult) {
      if (options.isSyncing) meta.size = writeIO.length;
      storageUpdateSeqWrite.addSample(writeIO.offset == chunkSizeBeforeWrite);
    }
  }
  if (UNLIKELY(!writeResult)) {
    return writeResult;  // chunk becomes dirty.
  }

  // update chunk checksum
  auto checksumRes = updateChecksum(chunkInfo, writeIO, chunkSizeBeforeWrite, isAppendWrite);
  if (UNLIKELY(!checksumRes)) {
    return makeError(std::move(checksumRes.error()));
  }

  // 4. finish to write.
  meta.chunkState = ChunkState::CLEAN;

  XLOGF(DBG, "chunk {} {} write finish", chunkId, meta);
  setResult = store.set(chunkId, chunkInfo, !skipPersist);
  if (UNLIKELY(!setResult)) {
    return makeError(std::move(setResult.error()));
  }
  result.checksum = meta.checksum();
  result.commitVer = meta.commitVer;
  result.commitChainVer = meta.chainVer;

  recordGuard.succ();
  return writeResult;
}
|
||||
|
||||
// Recompute `chunkInfo.meta`'s checksum after the data operation described by `writeIO`.
//
// `writeIO` is taken by value because truncate/extend requests are rewritten locally into an
// empty write at the current end of the chunk. One of four strategies is used:
//   - none:       the write carries no checksum or the chunk is empty -> value reset to 0;
//   - reuse:      the write covers the whole chunk -> take the write's checksum as is;
//   - combine:    an append to a non-empty chunk with a matching checksum type -> combine the
//                 previous chunk checksum with the write's checksum;
//   - read chunk: otherwise -> checksum the data before and after the written range through the
//                 chunk view and combine prefix + write + suffix.
// Returns an error if reading the chunk or combining checksums fails; in that case the stored
// checksum value and type are left as they were.
Result<Void> ChunkReplica::updateChecksum(ChunkInfo &chunkInfo,
                                          UpdateIO writeIO,
                                          uint32_t chunkSizeBeforeWrite,
                                          bool isAppendWrite) {
  const auto &chunkId = writeIO.key.chunkId;
  ChunkMetadata &meta = chunkInfo.meta;
  auto chunkChecksum = meta.checksum();
  bool combineChecksum = chunkSizeBeforeWrite > 0 && isAppendWrite;

  if (writeIO.isTruncate() || writeIO.isExtend()) {
    // Model truncate/extend as a zero-length write at the (already updated) end of the chunk.
    writeIO.checksum = ChecksumInfo::create(meta.checksumType, (const uint8_t *)nullptr, 0);
    writeIO.offset = meta.size;
    writeIO.length = 0;
  }

  if (writeIO.checksum.type == ChecksumType::NONE || meta.size == 0) {
    meta.checksumValue = 0;
    storageUpdateChecksumNone.addSample(1);
  } else if (writeIO.offset == 0 && writeIO.length == meta.size) {
    meta.checksumValue = writeIO.checksum.value;
    storageUpdateChecksumReuse.addSample(1);
  } else if (writeIO.checksum.type == chunkChecksum.type && combineChecksum) {
    // combine the chunk checksum and write io checksum if this write appends to existing chunk
    auto combinResult = chunkChecksum.combine(writeIO.checksum, writeIO.length);

    if (UNLIKELY(!combinResult)) {
      XLOGF(ERR,
            "Failed to combine checksums: error {}, chunkId {}, meta {}, write io: {}",
            combinResult.error(),
            chunkId,
            meta,
            writeIO);
      return makeError(combinResult.error());
    }

    meta.checksumValue = chunkChecksum.value;
    storageUpdateChecksumCombine.addSample(1);
  } else {
    // read the prefix of chunk and compute its checksum
    auto prefixChecksum = chunkInfo.view.checksum(writeIO.checksum.type, writeIO.offset, 0, meta);

    if (UNLIKELY(!prefixChecksum)) {
      XLOGF(ERR,
            "Failed to calculate chunk prefix checksum: error {}, chunkId {}, meta {}, write io: {}",
            prefixChecksum.error(),
            chunkId,
            meta,
            writeIO);
      return makeError(std::move(prefixChecksum.error()));
    }

    // read the suffix of chunk and compute its checksum
    uint32_t suffixStart = std::min(writeIO.offset + writeIO.length, meta.size);
    uint32_t suffixLength = meta.size - suffixStart;
    auto suffixChecksum = chunkInfo.view.checksum(writeIO.checksum.type, suffixLength, suffixStart, meta);

    if (UNLIKELY(!suffixChecksum)) {
      XLOGF(ERR,
            "Failed to calculate chunk suffix checksum: error {}, chunkId {}, meta {}, write io: {}",
            suffixChecksum.error(),
            chunkId,
            meta,
            writeIO);
      return makeError(std::move(suffixChecksum.error()));
    }

    // A failed combine must not be ignored: otherwise a wrong checksum would be stored silently.
    auto combineWrite = prefixChecksum->combine(writeIO.checksum, writeIO.length);
    if (UNLIKELY(!combineWrite)) {
      XLOGF(ERR,
            "Failed to combine prefix and write checksums: error {}, chunkId {}, meta {}, write io: {}",
            combineWrite.error(),
            chunkId,
            meta,
            writeIO);
      return makeError(combineWrite.error());
    }

    auto combineSuffix = prefixChecksum->combine(*suffixChecksum, suffixLength);
    if (UNLIKELY(!combineSuffix)) {
      XLOGF(ERR,
            "Failed to combine chunk and suffix checksums: error {}, chunkId {}, meta {}, write io: {}",
            combineSuffix.error(),
            chunkId,
            meta,
            writeIO);
      return makeError(combineSuffix.error());
    }

    meta.checksumValue = prefixChecksum->value;
    storageUpdateChecksumReadChunk.addSample(1);
  }

  meta.checksumType = writeIO.checksum.type;
  return Void{};
}
|
||||
|
||||
// commit the version of this chunk.
|
||||
// Commit a version of the chunk named by `job.commitIO()`.
//
// The chunk metadata is copied out of `store`, validated against the request, updated and then
// either removed (when the metadata reports it is ready to be removed) or stored back.
// `job.result()` receives the resulting commit / update / chain versions. Returns 0 on success.
Result<uint32_t> ChunkReplica::commit(ChunkStore &store, UpdateJob &job) {
  auto recordGuard = storageCommitRecorder.record();

  auto &commitIO = job.commitIO();
  auto &chunkId = commitIO.key.chunkId;
  auto &result = job.result();

  // 1. get meta info.
  auto lookup = store.get(chunkId);
  const bool removeOfMissingChunk =
      commitIO.isRemove && !lookup && lookup.error().code() == StorageCode::kChunkMetadataNotFound;
  if (removeOfMissingChunk) {
    // Nothing left to commit: report the requested version as committed.
    result.commitVer = result.updateVer = commitIO.commitVer;
    result.commitChainVer = commitIO.commitChainVer;
    return 0;
  }
  RETURN_AND_LOG_ON_ERROR(lookup);

  // Work on a copy; the store is updated explicitly at the end.
  auto chunkInfo = (*lookup)->second;
  ChunkMetadata &meta = chunkInfo.meta;
  result.commitVer = meta.commitVer;
  result.updateVer = meta.updateVer;
  result.commitChainVer = meta.chainVer;

  // 2. validate the request against the current metadata.
  if (job.commitChainVer() < meta.chainVer) {
    auto msg = fmt::format("chunk {} {} chain version mismatch {}", chunkId, meta, commitIO);
    reportFatalEvent();
    XLOG(DFATAL, msg);
    return makeError(StorageCode::kChainVersionMismatch, std::move(msg));
  }
  if (commitIO.commitVer > meta.updateVer) {
    auto msg = fmt::format("chunk {} meta {} commit version mismatch", chunkId, meta);
    reportFatalEvent();
    XLOG(DFATAL, msg);
    return makeError(StorageCode::kChunkVersionMismatch, std::move(msg));
  }

  // 3. advance the commit version.
  if (commitIO.isForce) {
    // A forced commit skips the dirty / stale checks and marks the chunk clean.
    meta.chunkState = ChunkState::CLEAN;
  } else {
    if (meta.chunkState == ChunkState::DIRTY) {
      auto msg = fmt::format("chunk {} is dirty {}", chunkId, meta);
      XLOG(ERR, msg);
      storageCommitDirty.addSample(1);
      return makeError(StorageCode::kChunkNotClean, std::move(msg));
    }
    if (!(meta.commitVer < commitIO.commitVer)) {
      auto msg = fmt::format("chunk {} stale commit {} > {}", chunkId, meta.commitVer, commitIO.commitVer);
      XLOG(ERR, msg);
      storageCommitStale.addSample(1);
      result.commitVer = meta.commitVer;
      result.commitChainVer = meta.chainVer;
      return makeError(StorageCode::kChunkStaleCommit, std::move(msg));
    }
  }
  meta.commitVer = commitIO.commitVer;

  // Once every update is committed, the chunk enters COMMIT state under the job's chain version.
  if (meta.commitVer == meta.updateVer) {
    meta.chunkState = ChunkState::COMMIT;
    meta.chainVer = job.commitChainVer();
  }
  meta.lastRequestId = job.requestCtx().tag.requestId;
  meta.lastClientUuid = job.requestCtx().tag.clientId.uuid;
  meta.timestamp = UtcClock::now();

  // 4. store the outcome: drop the chunk if it is ready to be removed, otherwise save the metadata.
  auto storeResult = [&] {
    if (meta.readyToRemove()) {
      return store.remove(chunkId, chunkInfo);
    }
    return store.set(chunkId, chunkInfo);
  }();
  if (UNLIKELY(!storeResult)) {
    return makeError(std::move(storeResult.error()));
  }
  result.commitVer = meta.commitVer;
  result.commitChainVer = meta.chainVer;

  recordGuard.succ();
  return 0;
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
33
src/storage/store/ChunkReplica.h
Normal file
33
src/storage/store/ChunkReplica.h
Normal file
@@ -0,0 +1,33 @@
|
||||
#pragma once
|
||||
|
||||
#include <folly/Range.h>
|
||||
|
||||
#include "common/utils/Result.h"
|
||||
#include "storage/aio/BatchReadJob.h"
|
||||
#include "storage/store/ChunkMetadata.h"
|
||||
#include "storage/store/ChunkStore.h"
|
||||
#include "storage/update/UpdateJob.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
// Stateless collection of the per-chunk read / update / commit operations of a storage replica.
// Every member is static; the chunk state itself lives in the ChunkStore passed to each call.
class ChunkReplica {
|
||||
public:
|
||||
// prepare aio read.
|
||||
static Result<Void> aioPrepareRead(ChunkStore &store, AioReadJob &job);
|
||||
|
||||
// finish aio read.
|
||||
static Result<Void> aioFinishRead(ChunkStore &store, AioReadJob &job);
|
||||
|
||||
// do write: apply the write / truncate / extend / remove described by job.updateIO() to the chunk.
// The executor is forwarded to ChunkStore::createChunk() when the chunk has to be created.
|
||||
static Result<uint32_t> update(ChunkStore &store, UpdateJob &job, folly::CPUThreadPoolExecutor &executor);
|
||||
|
||||
// recompute the checksum stored in chunkInfo.meta after a data operation.
// writeIO is taken by value: the implementation rewrites it locally for truncate/extend requests.
static Result<Void> updateChecksum(ChunkInfo &chunkInfo,
|
||||
UpdateIO writeIO,
|
||||
uint32_t chunkSizeBeforeWrite,
|
||||
bool isAppendWrite);
|
||||
|
||||
// commit the version of this chunk.
|
||||
static Result<uint32_t> commit(ChunkStore &store, UpdateJob &job);
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
216
src/storage/store/ChunkStore.cc
Normal file
216
src/storage/store/ChunkStore.cc
Normal file
@@ -0,0 +1,216 @@
|
||||
#include "storage/store/ChunkStore.h"
|
||||
|
||||
#include <chrono>
|
||||
#include <fcntl.h>
|
||||
#include <folly/Hash.h>
|
||||
#include <folly/ScopeGuard.h>
|
||||
#include <folly/logging/xlog.h>
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/utils/Result.h"
|
||||
#include "storage/store/ChunkMetadata.h"
|
||||
#include "storage/store/ChunkReplica.h"
|
||||
#include "storage/update/UpdateJob.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
monitor::OperationRecorder chunkStoreCreateRecorder{"storage.chunk_store_create"};
|
||||
monitor::OperationRecorder chunkStoreSetRecorder{"storage.chunk_store_set"};
|
||||
monitor::CountRecorder chunkStoreSetWriteDownRecorder{"storage.chunk_store_set.write_down"};
|
||||
monitor::CountRecorder chunkStoreSetWriteSkipRecorder{"storage.chunk_store_set.write_skip"};
|
||||
|
||||
monitor::OperationRecorder listAllChunkIdsRecorder{"storage.list_all_chunks"};
|
||||
monitor::DistributionRecorder chunkCountRecorder{"storage.list_all_chunks.chunk_count"};
|
||||
monitor::DistributionRecorder uncommittedRecorder{"storage.list_all_chunks.uncommitted_count"};
|
||||
|
||||
monitor::OperationRecorder queryChunksRecorder{"storage.query_chunks"};
|
||||
monitor::OperationRecorder resetUncommittedRecorder{"storage.reset_uncommitted"};
|
||||
|
||||
// initialize chunk store.
|
||||
// Create the on-disk state of a new chunk store: first the file store, then the meta store.
// On success the target id and the monitoring tag are taken from `config`.
Result<Void> ChunkStore::create(const PhysicalConfig &config) {
  auto fileStoreResult = fileStore_.create(config);
  RETURN_AND_LOG_ON_ERROR(fileStoreResult);

  auto metaStoreResult = metaStore_.create(config_.kv_store(), config);
  RETURN_AND_LOG_ON_ERROR(metaStoreResult);

  // Every recorder sample tagged with tag_ carries the target id as its "instance".
  targetId_ = TargetId{config.target_id};
  tag_ = {{"instance", std::to_string(targetId_)}};
  return Void{};
}
|
||||
|
||||
// load chunk store.
|
||||
// Load an existing chunk store: first the file store, then the meta store.
// On success the target id and the monitoring tag are taken from `config`.
Result<Void> ChunkStore::load(const PhysicalConfig &config) {
  auto fileStoreResult = fileStore_.load(config);
  RETURN_AND_LOG_ON_ERROR(fileStoreResult);

  auto metaStoreResult = metaStore_.load(config_.kv_store(), config);
  RETURN_AND_LOG_ON_ERROR(metaStoreResult);

  // Every recorder sample tagged with tag_ carries the target id as its "instance".
  targetId_ = TargetId{config.target_id};
  tag_ = {{"instance", std::to_string(targetId_)}};
  return Void{};
}
|
||||
|
||||
// add new chunk size.
|
||||
// Register additional chunk sizes with the file store and then with the meta store.
// Stops at the first failure and returns its error.
Result<Void> ChunkStore::addChunkSize(const std::vector<Size> &sizeList) {
  auto fileStoreResult = fileStore_.addChunkSize(sizeList);
  RETURN_AND_LOG_ON_ERROR(fileStoreResult);

  auto metaStoreResult = metaStore_.addChunkSize(sizeList);
  RETURN_AND_LOG_ON_ERROR(metaStoreResult);

  return Void{};
}
|
||||
|
||||
// Look up the info of a chunk and return an iterator to its entry in the in-memory map.
// A miss falls back to the meta store; a loaded entry gets its file view opened and is
// inserted into the map before being returned. Errors from the meta store (for example
// "not found") and from opening the file are returned to the caller.
Result<ChunkStore::Map::ConstIterator> ChunkStore::get(const ChunkId &chunkId) {
  auto &shard = maps_[std::hash<ChunkId>{}(chunkId) % kShardsNum];

  // 1. find in cache.
  auto cached = shard.find(chunkId);
  if (cached != shard.end()) {
    return Result<Map::ConstIterator>(std::move(cached));
  }

  // 2. load from DB.
  ChunkInfo loaded;
  auto metaResult = metaStore_.get(chunkId, loaded.meta);
  if (!metaResult) {
    return makeError(std::move(metaResult.error()));
  }

  auto openResult = fileStore_.open(loaded.meta.innerFileId);
  RETURN_AND_LOG_ON_ERROR(openResult);
  loaded.view = *openResult;

  // Whether or not the insertion took place, the returned iterator is handed to the caller.
  auto emplaced = shard.emplace(chunkId, loaded);
  return Result<Map::ConstIterator>(std::move(emplaced.first));
}
|
||||
|
||||
// Create a new chunk: create its metadata in the meta store, open the backing file view and
// publish the resulting info in the in-memory map. `chunkInfo` is updated in place (metadata by
// the meta store, view by the file store).
Result<Void> ChunkStore::createChunk(const ChunkId &chunkId,
                                     uint32_t chunkSize,
                                     ChunkInfo &chunkInfo,
                                     folly::CPUThreadPoolExecutor &executor,
                                     bool allowToAllocate) {
  auto recordGuard = chunkStoreCreateRecorder.record();

  auto createResult = metaStore_.createChunk(chunkId, chunkInfo.meta, chunkSize, executor, allowToAllocate);
  RETURN_AND_LOG_ON_ERROR(createResult);

  auto viewResult = fileStore_.open(chunkInfo.meta.innerFileId);
  RETURN_AND_LOG_ON_ERROR(viewResult);
  chunkInfo.view = *viewResult;

  const auto shardIndex = std::hash<ChunkId>{}(chunkId) % kShardsNum;
  maps_[shardIndex].insert_or_assign(chunkId, chunkInfo);

  recordGuard.succ();
  return Void{};
}
|
||||
|
||||
// Store the info of a chunk. The metadata is written to the meta store when `persist` is true or
// the `force_persist` option is on; the in-memory map is updated in both cases (but not when the
// meta store write fails).
Result<Void> ChunkStore::set(const ChunkId &chunkId, const ChunkInfo &chunkInfo, bool persist /* = true */) {
  auto recordGuard = chunkStoreSetRecorder.record();

  const bool writeDown = persist || config_.force_persist();
  if (!writeDown) {
    chunkStoreSetWriteSkipRecorder.addSample(1);
  } else {
    chunkStoreSetWriteDownRecorder.addSample(1);
    auto persistResult = metaStore_.set(chunkId, chunkInfo.meta);
    RETURN_AND_LOG_ON_ERROR(persistResult);
  }

  const auto shardIndex = std::hash<ChunkId>{}(chunkId) % kShardsNum;
  maps_[shardIndex].insert_or_assign(chunkId, chunkInfo);

  recordGuard.succ();
  return Void{};
}
|
||||
|
||||
// Remove a chunk whose metadata reports it is ready to be removed: delete it from the meta store
// and then erase its entry from the in-memory map. Fails with kChunkNotReadyToRemove otherwise.
Result<Void> ChunkStore::remove(ChunkId chunkId, ChunkInfo &chunkInfo) {
  const bool removable = chunkInfo.meta.readyToRemove();
  if (UNLIKELY(!removable)) {
    return makeError(StorageCode::kChunkNotReadyToRemove);
  }
  XLOGF(DBG, "ready to remove: {}", chunkInfo.meta);

  // Locate the map entry first so that it can be erased once the meta store removal succeeded.
  auto entry = get(chunkId);
  RETURN_AND_LOG_ON_ERROR(entry);

  RETURN_AND_LOG_ON_ERROR(metaStore_.remove(chunkId, chunkInfo.meta));

  auto &shard = maps_[std::hash<ChunkId>{}(chunkId) % kShardsNum];
  shard.erase(*entry);
  return Void{};
}
|
||||
|
||||
// Collect (chunk id, metadata) pairs for the chunks in [chunkIdRange.begin, chunkIdRange.end).
// The meta store iterator is positioned at the range end; iteration stops at the first id below
// the range begin or once `maxNumChunkIdsToProcess` entries were collected.
Result<std::vector<std::pair<ChunkId, ChunkMetadata>>> ChunkStore::queryChunks(const ChunkIdRange &chunkIdRange) {
  auto recordGuard = queryChunksRecorder.record(tag_);

  std::vector<std::pair<ChunkId, ChunkMetadata>> collected;

  auto cursor = metaStore_.iterator(chunkIdRange.end.data());
  RETURN_AND_LOG_ON_ERROR(cursor);

  for (; cursor->valid() && collected.size() < chunkIdRange.maxNumChunkIdsToProcess; cursor->next()) {
    auto currentId = cursor->chunkId();
    auto currentMeta = cursor->meta();

    // The end of the range is exclusive: [begin, end)
    if (currentId == chunkIdRange.end) {
      continue;
    }

    // Everything from here on lies before the range.
    if (currentId < chunkIdRange.begin) {
      break;
    }

    RETURN_AND_LOG_ON_ERROR(currentMeta);
    collected.emplace_back(currentId, *currentMeta);
  }
  RETURN_AND_LOG_ON_ERROR(cursor->status());

  recordGuard.succ();
  return collected;
}
|
||||
|
||||
// Append one summary entry (id, versions, state, checksum, length) to `metas` for every chunk in
// the meta store. Fails on the first entry whose metadata cannot be parsed or when the iterator
// reports an error; entries appended before the failure stay in `metas`.
Result<Void> ChunkStore::getAllMetadata(ChunkMetaVector &metas) {
  auto recordGuard = listAllChunkIdsRecorder.record(tag_);

  auto cursor = metaStore_.iterator();
  RETURN_AND_LOG_ON_ERROR(cursor);

  for (; cursor->valid(); cursor->next()) {
    auto chunkId = cursor->chunkId();
    auto parsed = cursor->meta();
    if (UNLIKELY(!parsed)) {
      XLOGF(ERR, "chunk {} parse meta failed {}", chunkId, parsed.error());
      return makeError(std::move(parsed.error()));
    }

    auto &meta = *parsed;
    metas.emplace_back();
    auto &entry = metas.back();
    entry.chunkId = std::move(chunkId);
    entry.updateVer = meta.updateVer;
    entry.commitVer = meta.commitVer;
    entry.chainVer = meta.chainVer;
    entry.chunkState = meta.chunkState;
    entry.checksum = meta.checksum();
    entry.length = meta.size;
  }
  RETURN_AND_LOG_ON_ERROR(cursor->status());

  recordGuard.succ();
  chunkCountRecorder.addSample(metas.size(), tag_);
  return Void{};
}
|
||||
|
||||
// Force-commit every chunk the meta store lists as uncommitted, at the chunk's current update
// version and under `chainVer`. Failures for individual chunks are logged and skipped; the
// uncommitted list is cleared at the end in any case.
Result<Void> ChunkStore::resetUncommitted(ChainVer chainVer) {
  auto &pending = metaStore_.uncommitted();
  if (pending.empty()) {
    return Void{};
  }

  XLOGF(CRITICAL, "reset uncommitted chunks, size: {}", pending.size());
  for (auto &chunkId : pending) {
    auto recordGuard = resetUncommittedRecorder.record();

    auto lookup = get(chunkId);
    if (!lookup) {
      XLOGF(ERR, "reset uncommitted chunk {} not found", chunkId);
      continue;
    }

    auto chunkInfo = (*lookup)->second;
    XLOGF(CRITICAL, "reset uncommitted chunk {} meta {}", chunkId, chunkInfo.meta);

    // Build a forced commit request for the chunk's current update version.
    CommitIO commitIO;
    commitIO.key.chunkId = chunkId;
    commitIO.commitVer = chunkInfo.meta.updateVer;
    commitIO.isForce = true;
    commitIO.commitChainVer = chainVer;

    ServiceRequestContext requestCtx{"commit", MessageTag(ClientId{Uuid::max()}, {})};
    ChunkEngineUpdateJob updateChunk{};
    UpdateJob updateJob(requestCtx, commitIO, {}, updateChunk, nullptr);

    auto committed = ChunkReplica::commit(*this, updateJob);
    if (committed) {
      recordGuard.succ();
    } else {
      XLOGF(ERR, "reset uncommitted chunk {} set failed: {}", chunkId, committed.error());
    }
  }
  pending.clear();
  return Void{};
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
117
src/storage/store/ChunkStore.h
Normal file
117
src/storage/store/ChunkStore.h
Normal file
@@ -0,0 +1,117 @@
|
||||
#pragma once
|
||||
|
||||
#include <folly/concurrency/ConcurrentHashMap.h>
|
||||
#include <optional>
|
||||
#include <vector>
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/utils/ConfigBase.h"
|
||||
#include "common/utils/FdWrapper.h"
|
||||
#include "common/utils/Path.h"
|
||||
#include "common/utils/Result.h"
|
||||
#include "common/utils/RobinHood.h"
|
||||
#include "storage/store/ChunkFileStore.h"
|
||||
#include "storage/store/ChunkMetaStore.h"
|
||||
#include "storage/store/ChunkMetadata.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
// Value type of the hot-updated `point_query_strategy` item in ChunkStore::Config (default NONE).
// NOTE(review): what each strategy does is not visible in this header — document it where the
// option is consumed.
enum class PointQueryStrategy {
|
||||
NONE,
|
||||
CLASSIC,
|
||||
MODERN,
|
||||
};
|
||||
|
||||
// Store of the chunks of one storage target. It combines a ChunkFileStore (chunk data), a
// ChunkMetaStore (chunk metadata) and a sharded in-memory map from chunk id to ChunkInfo.
class ChunkStore {
|
||||
public:
|
||||
// Configuration of the chunk store and of the stores it owns.
class Config : public ConfigBase<Config> {
|
||||
CONFIG_OBJ(kv_store, kv::KVStore::Config);
|
||||
CONFIG_OBJ(file_store, ChunkFileStore::Config);
|
||||
CONFIG_OBJ(meta_store, ChunkMetaStore::Config);
|
||||
CONFIG_ITEM(mutex_num, 257u, ConfigCheckers::isPositivePrime<uint32_t>);
|
||||
CONFIG_ITEM(kv_path, Path{});
|
||||
CONFIG_HOT_UPDATED_ITEM(migrate_kv_store, false);
|
||||
// when true, set() persists the metadata even if the caller passes persist = false.
CONFIG_HOT_UPDATED_ITEM(force_persist, true);
|
||||
CONFIG_HOT_UPDATED_ITEM(point_query_strategy, PointQueryStrategy::NONE);
|
||||
};
|
||||
|
||||
// in-memory map from chunk id to chunk info; get() hands out iterators into it.
using Map = folly::ConcurrentHashMap<ChunkId, ChunkInfo>;
|
||||
|
||||
// `config` is stored by reference, so it has to outlive the chunk store.
ChunkStore(const Config &config, GlobalFileStore &globalFileStore)
|
||||
: config_(config),
|
||||
fileStore_(config_.file_store(), globalFileStore),
|
||||
metaStore_(config_.meta_store(), fileStore_) {}
|
||||
|
||||
// direct access to the underlying meta store.
ChunkMetaStore &chunkMetaStore() { return metaStore_; }
|
||||
|
||||
// create chunk store.
|
||||
Result<Void> create(const PhysicalConfig &config);
|
||||
|
||||
// load chunk store.
|
||||
Result<Void> load(const PhysicalConfig &config);
|
||||
|
||||
// add new chunk size.
|
||||
Result<Void> addChunkSize(const std::vector<Size> &sizeList);
|
||||
|
||||
// migrate meta store.
|
||||
Result<Void> migrate(const PhysicalConfig &config) { return metaStore_.migrate(config_.kv_store(), config); }
|
||||
|
||||
// get the info of a chunk: looked up in the in-memory map first, loaded from the meta store on a miss.
|
||||
Result<Map::ConstIterator> get(const ChunkId &chunkId);
|
||||
|
||||
// create a new chunk file.
|
||||
Result<Void> createChunk(const ChunkId &chunkId,
|
||||
uint32_t chunkSize,
|
||||
ChunkInfo &chunkInfo,
|
||||
folly::CPUThreadPoolExecutor &executor,
|
||||
bool allowToAllocate);
|
||||
|
||||
// set meta of a chunk file. The meta store is only written when `persist` is true or the
// force_persist option is on; the in-memory map is updated either way.
|
||||
Result<Void> set(const ChunkId &chunkId, const ChunkInfo &chunkInfo, bool persist = true);
|
||||
|
||||
// remove a chunk file. Fails unless chunkInfo.meta reports that it is ready to be removed.
|
||||
Result<Void> remove(ChunkId chunkId, ChunkInfo &chunkInfo);
|
||||
|
||||
// recycle a batch of chunks.
|
||||
Result<bool> punchHole() { return metaStore_.punchHole(); }
|
||||
|
||||
// sync meta kv.
|
||||
Result<Void> sync() { return metaStore_.sync(); }
|
||||
|
||||
// query chunks: the chunk ids in result are in reverse lexicographical order
|
||||
Result<std::vector<std::pair<ChunkId, ChunkMetadata>>> queryChunks(const ChunkIdRange &chunkIdRange);
|
||||
|
||||
// append a summary (id, versions, state, checksum, length) of every chunk to `metas`.
|
||||
Result<Void> getAllMetadata(ChunkMetaVector &metas);
|
||||
|
||||
// get meta iterator.
|
||||
auto metaIterator() { return metaStore_.iterator(); }
|
||||
|
||||
// get used size.
|
||||
uint64_t usedSize() const { return metaStore_.usedSize(); }
|
||||
|
||||
// get reserved and unrecycled size.
|
||||
Result<Void> unusedSize(int64_t &reservedSize, int64_t &unrecycledSize) {
|
||||
return metaStore_.unusedSize(reservedSize, unrecycledSize);
|
||||
}
|
||||
|
||||
// get all uncommitted chunk ids.
|
||||
const auto &uncommitted() { return metaStore_.uncommitted(); }
|
||||
|
||||
// reset uncommitted chunk to committed state.
|
||||
Result<Void> resetUncommitted(ChainVer chainVer);
|
||||
|
||||
// enable or disable emergency recycling.
|
||||
void setEmergencyRecycling(bool enable) { return metaStore_.setEmergencyRecycling(enable); }
|
||||
|
||||
private:
|
||||
// NOTE: member order matters — fileStore_ is built from config_, metaStore_ from config_ and fileStore_.
const Config &config_;
|
||||
ChunkFileStore fileStore_;
|
||||
ChunkMetaStore metaStore_;
|
||||
// set by create() / load() from the physical config.
TargetId targetId_;
|
||||
// tag set ({"instance": target id}) attached to some recorder samples.
monitor::TagSet tag_;
|
||||
// the in-memory map is split into shards selected by std::hash<ChunkId> % kShardsNum.
static constexpr auto kShardsNum = 32u;
|
||||
std::array<Map, kShardsNum> maps_;
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
77
src/storage/store/GlobalFileStore.cc
Normal file
77
src/storage/store/GlobalFileStore.cc
Normal file
@@ -0,0 +1,77 @@
|
||||
#include "storage/store/GlobalFileStore.h"
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <folly/logging/xlog.h>
|
||||
|
||||
#include "common/utils/Duration.h"
|
||||
#include "common/utils/Size.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
// Return the descriptor pair registered for `filePath`, opening the file on first use.
// The file is opened twice: once with O_SYNC (optionally creating it, mode 0644) and once with
// O_DIRECT. The entry is only filled in when both opens succeed; the work runs under the lock of
// the shard selected by `filePath`.
Result<FileDescriptor *> GlobalFileStore::open(const Path &filePath, bool createFile /* = false */) {
  return shards_.withLock(
      [&](FdMap &fdMap) -> Result<FileDescriptor *> {
        // operator[] creates an empty entry on first use; a valid normal fd means it is already open.
        auto &entry = fdMap[filePath];
        if (entry.normal_.valid()) {
          return &entry;
        }

        FileDescriptor opened;

        // open in normal mode.
        const auto normalFlags = O_RDWR | O_SYNC;
        int normalFd = -1;
        if (createFile) {
          normalFd = ::open(filePath.c_str(), O_CREAT | normalFlags, 0644);
        } else {
          normalFd = ::open(filePath.c_str(), normalFlags);
        }
        if (UNLIKELY(normalFd == -1)) {
          auto msg = fmt::format("chunk store open file {} failed: errno {}", filePath, errno);
          XLOG(ERR, msg);
          return makeError(StorageCode::kChunkOpenFailed, std::move(msg));
        }
        opened.normal_ = normalFd;

        // open in direct mode.
        const auto directFlags = O_RDWR | O_DIRECT;
        int directFd = ::open(filePath.c_str(), directFlags);
        if (UNLIKELY(directFd == -1)) {
          auto msg = fmt::format("chunk store open file {} failed: errno {}", filePath, errno);
          XLOG(ERR, msg);
          return makeError(StorageCode::kChunkOpenFailed, std::move(msg));
        }
        opened.direct_ = directFd;

        entry = std::move(opened);
        return &entry;
      },
      filePath);
}
|
||||
|
||||
void GlobalFileStore::collect(std::vector<int> &fds) {
|
||||
fds.clear();
|
||||
fds.reserve(128_KB);
|
||||
shards_.iterate([&](FdMap &map) {
|
||||
for (auto &[path, fd] : map) {
|
||||
fd.index_ = fds.size();
|
||||
fds.push_back(fd.direct_);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Drop every registered descriptor. Each shard's map is moved into a task on `executor` and
// cleared there; the call blocks, polling every 50 ms, until all kShardsNum tasks have finished.
Result<Void> GlobalFileStore::clear(CPUExecutorGroup &executor) {
  std::atomic<uint32_t> finished = 0;
  shards_.iterate([&](FdMap &fdMap) {
    // `finished` is captured by reference: this function does not return before every task ran.
    executor.pickNext().add([&, detached = std::move(fdMap)]() mutable {
      detached.clear();
      ++finished;
    });
  });

  int round = 0;
  while (finished != kShardsNum) {
    XLOGF_IF(INFO, round % 5 == 0, "Waiting for clear fd finished...");
    std::this_thread::sleep_for(50_ms);
    ++round;
  }
  return Void{};
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
36
src/storage/store/GlobalFileStore.h
Normal file
36
src/storage/store/GlobalFileStore.h
Normal file
@@ -0,0 +1,36 @@
|
||||
#pragma once
|
||||
|
||||
#include <folly/ThreadLocal.h>
|
||||
#include <folly/executors/CPUThreadPoolExecutor.h>
|
||||
#include <mutex>
|
||||
|
||||
#include "common/utils/CPUExecutorGroup.h"
|
||||
#include "common/utils/ConfigBase.h"
|
||||
#include "common/utils/FdWrapper.h"
|
||||
#include "common/utils/Path.h"
|
||||
#include "common/utils/Result.h"
|
||||
#include "common/utils/Shards.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
// The descriptors GlobalFileStore keeps for one file path.
struct FileDescriptor {
|
||||
// descriptor opened by GlobalFileStore::open() with O_RDWR | O_SYNC.
FdWrapper normal_;
|
||||
// descriptor opened by GlobalFileStore::open() with O_RDWR | O_DIRECT.
FdWrapper direct_;
|
||||
// position of direct_ in the vector filled by GlobalFileStore::collect(); empty until collect() ran.
std::optional<uint32_t> index_{};
|
||||
};
|
||||
|
||||
// Registry of the file descriptors opened for chunk files, keyed by file path and split into shards.
class GlobalFileStore {
|
||||
public:
|
||||
// return the descriptors of `filePath`, opening the file on first use (creating it when
// `createFile` is true). The returned pointer refers to the entry stored in the registry.
Result<FileDescriptor *> open(const Path &filePath, bool createFile = false);
|
||||
|
||||
// rebuild `fds` with the direct-mode descriptor of every registered file and store each
// descriptor's position in FileDescriptor::index_.
void collect(std::vector<int> &fds);
|
||||
|
||||
// clear all shards on `executor`; blocks until every shard has been cleared.
Result<Void> clear(CPUExecutorGroup &executor);
|
||||
|
||||
private:
|
||||
constexpr static auto kShardsNum = 256u;
|
||||
using FdMap = std::unordered_map<Path, FileDescriptor>;
|
||||
Shards<FdMap, kShardsNum> shards_;
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
38
src/storage/store/PhysicalConfig.h
Normal file
38
src/storage/store/PhysicalConfig.h
Normal file
@@ -0,0 +1,38 @@
|
||||
#pragma once
|
||||
|
||||
#include "common/serde/Serde.h"
|
||||
#include "common/utils/Path.h"
|
||||
#include "common/utils/Size.h"
|
||||
#include "kv/KVStore.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
// Physical configuration of the storage target. Store in `target.toml`.
|
||||
static inline constexpr auto kPhysicalConfigFileName = "target.toml";
|
||||
|
||||
class PhysicalConfig {
|
||||
SERDE_STRUCT_FIELD(path, Path{});
|
||||
SERDE_STRUCT_FIELD(target_id, uint64_t{});
|
||||
SERDE_STRUCT_FIELD(block_device_uuid, std::string{});
|
||||
SERDE_STRUCT_FIELD(allow_disk_without_uuid, false);
|
||||
SERDE_STRUCT_FIELD(allow_existing_targets, false);
|
||||
|
||||
SERDE_STRUCT_FIELD(physical_file_count, 256u);
|
||||
SERDE_STRUCT_FIELD(chunk_size_list, (std::vector<Size>{512_KB, 1_MB, 2_MB, 4_MB, 16_MB, 64_MB}));
|
||||
SERDE_STRUCT_FIELD(chain_id, uint32_t{});
|
||||
SERDE_STRUCT_FIELD(kv_store_type, kv::KVStore::Type::LevelDB);
|
||||
SERDE_STRUCT_FIELD(has_sentinel, false);
|
||||
SERDE_STRUCT_FIELD(kv_store_name, std::string{"meta"});
|
||||
SERDE_STRUCT_FIELD(kv_path, std::optional<Path>{});
|
||||
SERDE_STRUCT_FIELD(only_chunk_engine, false);
|
||||
|
||||
public:
|
||||
Path kvPath() const {
|
||||
if (kv_path.has_value()) {
|
||||
return *kv_path / fmt::format("{}_{}", kv_store_name, target_id);
|
||||
}
|
||||
return path / kv_store_name;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
445
src/storage/store/StorageTarget.cc
Normal file
445
src/storage/store/StorageTarget.cc
Normal file
@@ -0,0 +1,445 @@
|
||||
#include "storage/store/StorageTarget.h"
|
||||
|
||||
#include <boost/filesystem/operations.hpp>
|
||||
#include <folly/experimental/symbolizer/Symbolizer.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/serde/Serde.h"
|
||||
#include "common/utils/MagicEnum.hpp"
|
||||
#include "common/utils/Result.h"
|
||||
#include "common/utils/SysResource.h"
|
||||
#include "storage/store/ChunkEngine.h"
|
||||
#include "storage/store/ChunkReplica.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
namespace {
|
||||
|
||||
// Outcome counters for chunk updates (see StorageTarget::updateChunk).
monitor::CountRecorder storageUpdateChecksumMismatch{"storage.chunk_update.checksum_mismatch"};
monitor::CountRecorder storageUpdateReplace{"storage.chunk_update.replace"};
monitor::CountRecorder storageUpdateCommitted{"storage.chunk_update.committed"};
monitor::CountRecorder storageUpdateStale{"storage.chunk_update.stale"};
monitor::CountRecorder storageUpdateMissing{"storage.chunk_update.missing"};
monitor::CountRecorder storageUpdateAdvance{"storage.chunk_update.advance"};
monitor::CountRecorder storageWriteTimes{"storage.chunk_write.times"};
monitor::CountRecorder storageRemoveTimes{"storage.chunk_remove.times"};
monitor::CountRecorder storageTruncateTimes{"storage.chunk_truncate.times"};

// Read metrics, tagged per disk by each StorageTarget.
monitor::CountRecorder aioReadCountPerDisk{"storage.aio_read.count_per_disk"};
monitor::CountRecorder aioReadBytesPerDisk{"storage.aio_read.bytes_per_disk"};
monitor::CountRecorder aioReadSuccBytesPerDisk{"storage.aio_read.succ_bytes_per_disk"};
monitor::LatencyRecorder aioReadSuccLatencyPerDisk{"storage.aio_read.succ_latency_per_disk"};

// Space gauges, tagged per target by each StorageTarget.
monitor::ValueRecorder targetUsedSize{"storage.target.used_size", std::nullopt, false};
monitor::ValueRecorder targetReservedSize{"storage.target.reserved_size", std::nullopt, false};
monitor::ValueRecorder targetUnrecycledSize{"storage.target.unrecycled_size", std::nullopt, false};

// Records single-chunk lookups issued from StorageTarget::queryChunks.
monitor::OperationRecorder pointQueryRecorder{"storage.point_query"};

// Source of StorageTarget::generationId(); incremented once per constructed target.
std::atomic<uint32_t> gGenerationId{0};
|
||||
|
||||
// Looks up the UUID recorded for the device that holds `path`.
//
// Fails with kStorageStatFailed when stat(2) on `path` fails, propagates the
// error of SysResource::fileSystemUUID(), and fails with kStorageUUIDMismatch
// when the device id of `path` has no entry in the returned table.
Result<std::string> getDeviceUUID(const Path &path) {
  struct stat st;
  if (::stat(path.c_str(), &st) != 0) {
    auto msg = fmt::format("stat {} failed: {}", path.string(), errno);
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageStatFailed, std::move(msg));
  }

  auto uuidTable = SysResource::fileSystemUUID();
  RETURN_AND_LOG_ON_ERROR(uuidTable);

  // One lookup serves both the presence check and the value.
  const auto entry = uuidTable->find(st.st_dev);
  if (entry == uuidTable->end()) {
    auto msg = fmt::format("Not found UUID for path {} device {}", path.string(), st.st_dev);
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageUUIDMismatch, std::move(msg));
  }
  return entry->second;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Builds an empty target bound to disk `diskIndex`.
//
// `config` is kept by reference. `engine` is stored as given and is
// dereferenced later whenever the target runs in chunk-engine mode.
// Each construction takes the next value of the file-local generation counter.
// The per-target gauges start out tagged with "0"; create() and load() re-tag
// them once the real target id is known.
StorageTarget::StorageTarget(const Config &config,
                             GlobalFileStore &globalFileStore,
                             uint32_t diskIndex,
                             chunk_engine::Engine *engine)
    : config_(config),
      diskIndex_(diskIndex),
      generationId_(++gGenerationId),
      engine_(engine),
      // Monitoring tags.
      diskTag_(monitor::instanceTagSet(std::to_string(diskIndex))),
      targetTag_(monitor::instanceTagSet(std::to_string(0))),
      // Per-disk read recorders.
      readCountPerDisk_(aioReadCountPerDisk.getRecoderWithTag(diskTag_)),
      readBytesPerDisk_(aioReadBytesPerDisk.getRecoderWithTag(diskTag_)),
      readSuccBytesPerDisk_(aioReadSuccBytesPerDisk.getRecoderWithTag(diskTag_)),
      readSuccLatencyPerDisk_(aioReadSuccLatencyPerDisk.getRecoderWithTag(diskTag_)),
      // Per-target space gauges.
      targetUsedSize_(targetUsedSize.getRecoderWithTag(targetTag_)),
      targetReservedSize_(targetReservedSize.getRecoderWithTag(targetTag_)),
      targetUnrecycledSize_(targetUnrecycledSize.getRecoderWithTag(targetTag_)),
      chunkStore_(config_, globalFileStore) {}
|
||||
|
||||
// Syncs the target on destruction unless release() has already been called.
// A failed sync is only logged; nothing is thrown from the destructor.
StorageTarget::~StorageTarget() {
  if (!released_) {
    auto syncResult = sync();
    if (UNLIKELY(!syncResult)) {
      XLOGF(CRITICAL, "storage target sync meta failed {}, error: {}", targetConfig_.path, syncResult.error());
    }
  }
}
|
||||
|
||||
// Creates a new storage target described by `config`, or validates an existing one.
//
// If `<config.path>/target.toml` already exists:
//   - fails with kStorageInitFailed unless config.allow_existing_targets is set;
//   - otherwise loads the target, checks that target id and physical file count
//     match `config`, adds any chunk sizes from config.chunk_size_list, and returns.
//
// Otherwise the target is created: the directory (chunk-engine mode) or the
// chunk store is set up, the device UUID is recorded, `target.toml` is written
// and the per-target gauges are re-tagged with the target id.
//
// Returns an error when any of these steps fails.
Result<Void> StorageTarget::create(const PhysicalConfig &config) {
  Path targetConfigFilePath = config.path / kPhysicalConfigFileName;
  if (boost::filesystem::exists(targetConfigFilePath)) {
    auto msg = fmt::format("Target config file {} already exists", targetConfigFilePath.string());
    XLOG(INFO, msg);
    if (!config.allow_existing_targets) {
      return makeError(StorageCode::kStorageInitFailed, std::move(msg));
    }
    RETURN_AND_LOG_ON_ERROR(load(config.path));
    if (targetConfig_.target_id != config.target_id) {
      auto mismatchMsg = fmt::format("target id is different: {} != {}", targetConfig_.target_id, config.target_id);
      XLOG(ERR, mismatchMsg);
      return makeError(StorageCode::kStorageInitFailed, std::move(mismatchMsg));
    }
    if (targetConfig_.physical_file_count != config.physical_file_count) {
      auto mismatchMsg = fmt::format("Physical file count is different: {} != {}",
                                     targetConfig_.physical_file_count,
                                     config.physical_file_count);
      XLOG(ERR, mismatchMsg);
      return makeError(StorageCode::kStorageInitFailed, std::move(mismatchMsg));
    }
    RETURN_AND_LOG_ON_ERROR(addChunkSize(config.chunk_size_list));
    XLOGF(INFO, "Target config file {} check passed", targetConfigFilePath.string());
    return Void{};
  }

  targetConfig_ = config;
  targetConfig_.has_sentinel = true;
  // An empty kv path in the service config means "no explicit kv path".
  auto kvPath = config_.kv_path();
  if (kvPath.empty()) {
    targetConfig_.kv_path = std::nullopt;
  } else {
    targetConfig_.kv_path = kvPath;
  }
  if (useChunkEngine()) {
    // In chunk-engine mode only the target directory is needed here.
    boost::system::error_code ec{};
    boost::filesystem::create_directories(targetConfig_.path, ec);
    if (UNLIKELY(ec.failed())) {
      auto msg = fmt::format("target create directory {} failed: {}", targetConfig_.path.string(), ec.message());
      XLOG(ERR, msg);
      return makeError(StorageCode::kChunkOpenFailed, std::move(msg));
    }
  } else {
    RETURN_AND_LOG_ON_ERROR(chunkStore_.create(targetConfig_));
  }

  auto getDeviceUUIDResult = getDeviceUUID(targetConfig_.path);
  if (getDeviceUUIDResult) {
    targetConfig_.block_device_uuid = *getDeviceUUIDResult;
  } else if (targetConfig_.allow_disk_without_uuid) {
    targetConfig_.block_device_uuid = "";
  } else {
    RETURN_AND_LOG_ON_ERROR(getDeviceUUIDResult);
  }

  std::ofstream targetConfigFile(targetConfigFilePath, std::ios::out);
  if (!targetConfigFile) {
    auto msg = fmt::format("Open target config file {} failed", targetConfigFilePath.string());
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }
  // The stream is buffered, so `<<` alone can succeed even though the data
  // never reaches the file. Flush before checking so that a failed write is
  // reported instead of being lost in the destructor.
  if (!(targetConfigFile << serde::toTomlString(targetConfig_)) || !targetConfigFile.flush()) {
    auto msg = fmt::format("Write target config file {} failed", targetConfigFilePath.string());
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }
  *chunkSizeList_.lock() = {targetConfig_.chunk_size_list.begin(), targetConfig_.chunk_size_list.end()};
  // Re-tag the per-target gauges now that the real target id is known.
  targetTag_ = monitor::instanceTagSet(std::to_string(targetConfig_.target_id));
  targetUsedSize_ = targetUsedSize.getRecoderWithTag(targetTag_);
  targetReservedSize_ = targetReservedSize.getRecoderWithTag(targetTag_);
  targetUnrecycledSize_ = targetUnrecycledSize.getRecoderWithTag(targetTag_);
  return Void{};
}
|
||||
|
||||
// Loads an existing storage target from the directory `path`.
//
// Steps:
//   1. Parse `<path>/target.toml` into targetConfig_ and require that the path
//      stored in it equals `path`.
//   2. Compare the stored device UUID with the UUID of the device holding
//      `path`. A missing UUID is treated as "" when allow_disk_without_uuid is set.
//   3. Unless the target runs in chunk-engine-only mode, load the chunk store
//      and, if migration is enabled and the configured kv store type differs
//      from the stored one, migrate the kv store and rewrite `target.toml`.
//   4. Refresh the in-memory chunk size set and re-tag the per-target gauges.
//
// Returns an error when any step fails.
Result<Void> StorageTarget::load(const Path &path) {
  RETURN_AND_LOG_ON_ERROR(serde::fromTomlFile(targetConfig_, path / kPhysicalConfigFileName));
  if (path != targetConfig_.path) {
    auto msg = fmt::format("Path config mismatch {} != real {}", targetConfig_.path, path);
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }

  auto getDeviceUUIDResult = getDeviceUUID(path);
  if (!getDeviceUUIDResult) {
    if (targetConfig_.allow_disk_without_uuid) {
      getDeviceUUIDResult = "";
    } else {
      RETURN_AND_LOG_ON_ERROR(getDeviceUUIDResult);
    }
  }
  if (targetConfig_.block_device_uuid != *getDeviceUUIDResult) {
    auto msg = fmt::format("UUID mismatch config {} != real {}", targetConfig_.block_device_uuid, *getDeviceUUIDResult);
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageUUIDMismatch, std::move(msg));
  }

  if (!targetConfig_.only_chunk_engine) {
    RETURN_AND_LOG_ON_ERROR(chunkStore_.load(targetConfig_));
  }
  if (!targetConfig_.only_chunk_engine && config_.migrate_kv_store() &&
      config_.kv_store().type() != targetConfig_.kv_store_type) {
    XLOGF(WARNING, "start migrate kv {} -> {}", targetConfig_, magic_enum::enum_name(config_.kv_store().type()));
    targetConfig_.kv_store_name = "kv";
    targetConfig_.kv_store_type = config_.kv_store().type();
    targetConfig_.has_sentinel = true;
    RETURN_AND_LOG_ON_ERROR(chunkStore_.migrate(targetConfig_));
    Path targetConfigFilePath = path / kPhysicalConfigFileName;
    std::ofstream targetConfigFile(targetConfigFilePath, std::ios::out);
    // The stream is buffered: flush before checking so that a write that
    // fails is reported here rather than silently dropped when the stream is
    // destroyed.
    if (!targetConfigFile || !(targetConfigFile << serde::toTomlString(targetConfig_)) ||
        !targetConfigFile.flush()) {
      auto msg = fmt::format("Write target config file {} failed", targetConfigFilePath.string());
      XLOG(ERR, msg);
      return makeError(StorageCode::kStorageInitFailed, std::move(msg));
    }
    XLOGF(WARNING, "finish migrate kv {} -> {}", targetConfig_, magic_enum::enum_name(config_.kv_store().type()));
  }
  *chunkSizeList_.lock() = {targetConfig_.chunk_size_list.begin(), targetConfig_.chunk_size_list.end()};
  // Re-tag the per-target gauges with the loaded target id.
  targetTag_ = monitor::instanceTagSet(std::to_string(targetConfig_.target_id));
  targetUsedSize_ = targetUsedSize.getRecoderWithTag(targetTag_);
  targetReservedSize_ = targetReservedSize.getRecoderWithTag(targetTag_);
  targetUnrecycledSize_ = targetUnrecycledSize.getRecoderWithTag(targetTag_);
  return Void{};
}
|
||||
|
||||
// Registers the chunk sizes of `sizeList` that this target does not know yet.
//
// No-op in chunk-engine mode and when every size is already known. Otherwise
// the new sizes are added to the chunk store, and `target.toml` is replaced
// (temp file + rename) by a copy of the current config whose chunk_size_list
// is the sorted union of old and new sizes. The in-memory size set is updated
// only after the rename succeeded.
//
// Returns kStorageInitFailed when the config cannot be written or renamed,
// or the chunk store's error when it rejects the new sizes.
//
// NOTE(review): targetConfig_.chunk_size_list is not updated here, while
// setChainId() serializes a copy of targetConfig_; a later setChainId() would
// therefore write the old size list back to disk -- confirm whether intended.
Result<Void> StorageTarget::addChunkSize(const std::vector<Size> &sizeList) {
  if (useChunkEngine()) {
    return Void{};
  }

  auto chunkSizeListGuard = chunkSizeList_.lock();

  std::vector<Size> newSizeList;
  for (auto size : sizeList) {
    if (!chunkSizeListGuard->contains(size)) {
      newSizeList.push_back(size);
    }
  }
  if (newSizeList.empty()) {
    return Void{};
  }
  RETURN_AND_LOG_ON_ERROR(chunkStore_.addChunkSize(newSizeList));

  // Build the full list to persist: new sizes plus everything already known.
  for (auto size : *chunkSizeListGuard) {
    newSizeList.push_back(size);
  }
  std::sort(newSizeList.begin(), newSizeList.end());

  auto newTargetConfig = targetConfig_;
  newTargetConfig.chunk_size_list = newSizeList;
  Path tempPath = newTargetConfig.path / fmt::format("{}.tmp", kPhysicalConfigFileName);
  {
    // Write and close the temp file BEFORE renaming it. Renaming while the
    // buffered stream is still open would hide write errors and could publish
    // an empty or partial config. (close() flushes the stream buffer; it does
    // not fsync.)
    std::ofstream targetConfigFile(tempPath, std::ios::out);
    if (targetConfigFile) {
      targetConfigFile << serde::toTomlString(newTargetConfig);
      targetConfigFile.close();
    }
    if (!targetConfigFile) {
      auto msg = fmt::format("Write target config file {} failed", tempPath.string());
      XLOG(ERR, msg);
      return makeError(StorageCode::kStorageInitFailed, std::move(msg));
    }
  }

  Path targetPath = newTargetConfig.path / kPhysicalConfigFileName;
  boost::system::error_code ec;
  boost::filesystem::rename(tempPath, targetPath, ec);
  if (UNLIKELY(ec.failed())) {
    auto msg = fmt::format("Re-write target config file {} failed, error: {}", targetPath.string(), ec.message());
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }
  for (auto size : sizeList) {
    chunkSizeListGuard->insert(size);
  }
  return Void{};
}
|
||||
|
||||
// Persists the chain id of this target if none has been set yet.
//
// When targetConfig_.chain_id is already non-zero the call returns success
// without changing anything (the requested `chainId` is not compared against
// the stored one). Otherwise a copy of the config carrying `chainId` is
// written to a temp file, renamed over `target.toml`, and only then is the
// in-memory chain id updated.
//
// Runs under the chunkSizeList_ lock, the same lock addChunkSize() holds.
// Returns kStorageInitFailed when the config cannot be written or renamed.
Result<Void> StorageTarget::setChainId(ChainId chainId) {
  auto chunkSizeListGuard = chunkSizeList_.lock();
  if (targetConfig_.chain_id != 0) {
    return Void{};
  }

  auto newTargetConfig = targetConfig_;
  newTargetConfig.chain_id = chainId;
  Path tempPath = newTargetConfig.path / fmt::format("{}.tmp", kPhysicalConfigFileName);
  {
    // Write and close the temp file BEFORE renaming it. Renaming while the
    // buffered stream is still open would hide write errors and could publish
    // an empty or partial config. (close() flushes the stream buffer; it does
    // not fsync.)
    std::ofstream targetConfigFile(tempPath, std::ios::out);
    if (targetConfigFile) {
      targetConfigFile << serde::toTomlString(newTargetConfig);
      targetConfigFile.close();
    }
    if (!targetConfigFile) {
      auto msg = fmt::format("Write target config file {} failed", tempPath.string());
      XLOG(ERR, msg);
      return makeError(StorageCode::kStorageInitFailed, std::move(msg));
    }
  }

  Path targetPath = newTargetConfig.path / kPhysicalConfigFileName;
  boost::system::error_code ec;
  boost::filesystem::rename(tempPath, targetPath, ec);
  if (UNLIKELY(ec.failed())) {
    auto msg = fmt::format("Re-write target config file {} failed, error: {}", targetPath.string(), ec.message());
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }

  targetConfig_.chain_id = chainId;
  return Void{};
}
|
||||
|
||||
// prepare aio read.
|
||||
// Prepares `job` for an asynchronous read.
// Counts the request and its aligned length on the per-disk recorders, then
// hands the job to the chunk engine or to the chunk store backend.
Result<Void> StorageTarget::aioPrepareRead(AioReadJob &job) {
  readCountPerDisk_->addSample(1);
  readBytesPerDisk_->addSample(job.alignedLength());

  if (!useChunkEngine()) {
    return ChunkReplica::aioPrepareRead(chunkStore_, job);
  }
  return ChunkEngine::aioPrepareRead(*engine_, job);
}
|
||||
|
||||
// Completes an asynchronous read.
// Jobs whose state carries a chunk-engine chunk need no work here; all other
// jobs are finished through the chunk store backend.
Result<Void> StorageTarget::aioFinishRead(AioReadJob &job) {
  const bool servedByEngine = job.state().chunkEngineJob.has_chunk();
  if (!servedByEngine) {
    return ChunkReplica::aioFinishRead(chunkStore_, job);
  }
  return Void{};
}
|
||||
|
||||
// update chunk (write/remove/truncate).
|
||||
// Applies `job` to the target and stores the outcome in the job.
//
// COMMIT jobs go straight to the backend's commit. Every other job goes to
// the backend's update; a success is counted by kind (sync replace, write,
// remove, extend) and a failure by its error code. `executor` is only used by
// the chunk store backend.
void StorageTarget::updateChunk(UpdateJob &job, folly::CPUThreadPoolExecutor &executor) {
  if (job.type() == UpdateType::COMMIT) {
    if (useChunkEngine()) {
      job.setResult(ChunkEngine::commit(*engine_, job, config_.kv_store().sync_when_write()));
    } else {
      job.setResult(ChunkReplica::commit(chunkStore_, job));
    }
    return;
  }

  auto result =
      useChunkEngine() ? ChunkEngine::update(*engine_, job) : ChunkReplica::update(chunkStore_, job, executor);

  if (UNLIKELY(!result.hasValue())) {
    // Failure: bump the counter that matches the error code, if there is one.
    uint32_t code = result.error().code();
    switch (code) {
      case StorageCode::kChecksumMismatch:
        storageUpdateChecksumMismatch.addSample(1);
        break;
      case StorageCode::kChunkCommittedUpdate:
        storageUpdateCommitted.addSample(1);
        break;
      case StorageCode::kChunkStaleUpdate:
        storageUpdateStale.addSample(1);
        break;
      case StorageCode::kChunkMissingUpdate:
        storageUpdateMissing.addSample(1);
        break;
      case StorageCode::kChunkAdvanceUpdate:
        storageUpdateAdvance.addSample(1);
        break;
      default:
        break;
    }
  } else {
    // Success: count sync replacements and the kind of update performed.
    if (job.options().isSyncing) {
      storageUpdateReplace.addSample(1);
    }
    const auto &io = job.updateIO();
    if (io.isWrite()) {
      storageWriteTimes.addSample(1);
    } else if (io.isRemove()) {
      storageRemoveTimes.addSample(1);
    } else if (io.isExtend()) {
      storageTruncateTimes.addSample(1);
    }
  }

  job.setResult(std::move(result));
}
|
||||
|
||||
// Returns the metadata of the chunks inside `chunkIdRange`.
//
// When the configured point-query strategy recognises the range as covering
// exactly one chunk, a single lookup is done instead of a range scan: a hit
// yields a one-element list, "metadata not found" yields an empty list, and
// any other error is passed on. All other ranges are scanned by the backend.
Result<std::vector<std::pair<ChunkId, ChunkMetadata>>> StorageTarget::queryChunks(const ChunkIdRange &chunkIdRange) {
  using ChunkList = std::vector<std::pair<ChunkId, ChunkMetadata>>;

  const auto strategy = config_.point_query_strategy();
  const bool singleChunk =
      (strategy == PointQueryStrategy::CLASSIC && chunkIdRange.begin.nextChunkId() == chunkIdRange.end) ||
      (strategy == PointQueryStrategy::MODERN && chunkIdRange.begin.rangeEndForCurrentChunk() == chunkIdRange.end);

  if (!singleChunk) {
    if (useChunkEngine()) {
      return ChunkEngine::queryChunks(*engine_, chunkIdRange, chainId());
    }
    return chunkStore_.queryChunks(chunkIdRange);
  }

  auto reportGuard = pointQueryRecorder.record();
  auto result = queryChunk(chunkIdRange.begin);
  if (result.hasValue()) {
    reportGuard.succ();
    return ChunkList(1, std::make_pair(chunkIdRange.begin, *result));
  }
  if (result.error().code() == StorageCode::kChunkMetadataNotFound) {
    // A missing chunk is a valid answer for a point query, not a failure.
    reportGuard.succ();
    return ChunkList{};
  }
  return makeError(std::move(result.error()));
}
|
||||
|
||||
// Returns the metadata of the single chunk `chunkId`, or the backend's error.
Result<ChunkMetadata> StorageTarget::queryChunk(const ChunkId &chunkId) {
  if (!useChunkEngine()) {
    auto lookup = chunkStore_.get(chunkId);
    RETURN_AND_LOG_ON_ERROR(lookup);
    return (*lookup)->second.meta;
  }
  return ChunkEngine::queryChunk(*engine_, chunkId, chainId());
}
|
||||
|
||||
// Publishes the space gauges of this target.
//
// The used-size gauge is always updated. In chunk store mode the reserved and
// unrecycled sizes are queried as well: on success they are published and
// their sum is cached for unusedSize(); on failure both gauges are set to -1
// and the error is logged. The function itself always returns success.
Result<Void> StorageTarget::reportUnrecycledSize() {
  targetUsedSize_->set(usedSize());
  if (useChunkEngine()) {
    return Void{};
  }

  int64_t reserved = 0;
  int64_t unrecycled = 0;
  auto queryResult = chunkStore_.unusedSize(reserved, unrecycled);
  if (LIKELY(bool(queryResult))) {
    unusedSize_ = reserved + unrecycled;
    targetReservedSize_->set(reserved);
    targetUnrecycledSize_->set(unrecycled);
  } else {
    targetReservedSize_->set(-1);
    targetUnrecycledSize_->set(-1);
    XLOGF(ERR, "target get unused size failed, {}, error: {}", targetConfig_.target_id, queryResult.error());
  }
  return Void{};
}
|
||||
|
||||
// Fills `metadataVec` with the metadata of all chunks, using whichever
// backend this target runs on.
Result<Void> StorageTarget::getAllMetadata(ChunkMetaVector &metadataVec) {
  if (!useChunkEngine()) {
    return chunkStore_.getAllMetadata(metadataVec);
  }
  return ChunkEngine::getAllMetadata(*engine_, chainId(), metadataVec);
}
|
||||
|
||||
// Collects the metadata of all chunks into `metas`, keyed by chunk id.
//
// In chunk store mode the metadata iterator is walked entry by entry; an
// entry whose metadata cannot be parsed aborts with kStorageInitFailed, and
// the iterator's final status is returned at the end.
Result<Void> StorageTarget::getAllMetadataMap(std::unordered_map<ChunkId, ChunkMetadata> &metas) {
  if (useChunkEngine()) {
    return ChunkEngine::getAllMetadataMap(*engine_, metas, chainId());
  }

  auto iteratorResult = chunkStore_.metaIterator();
  RETURN_AND_LOG_ON_ERROR(iteratorResult);

  auto &cursor = *iteratorResult;
  while (cursor.valid()) {
    auto chunkId = cursor.chunkId();
    auto parsedMeta = cursor.meta();
    if (UNLIKELY(!parsedMeta)) {
      auto msg = fmt::format("storage target dump parse meta failed: {}, chunk {}", parsedMeta.error(), chunkId);
      XLOG(ERR, msg);
      return makeError(StorageCode::kStorageInitFailed, std::move(msg));
    }
    metas[chunkId] = *parsedMeta;
    cursor.next();
  }
  return iteratorResult->status();
}
|
||||
|
||||
// Adds one sample to the per-disk recorders for successfully read bytes and
// for the latency of that read.
void StorageTarget::recordRealRead(uint32_t bytes, Duration latency) const {
  readSuccLatencyPerDisk_->addSample(latency);
  readSuccBytesPerDisk_->addSample(bytes);
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
188
src/storage/store/StorageTarget.h
Normal file
188
src/storage/store/StorageTarget.h
Normal file
@@ -0,0 +1,188 @@
|
||||
#pragma once
|
||||
|
||||
#include <folly/Synchronized.h>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "chunk_engine/src/cxx.rs.h"
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/utils/CoLockManager.h"
|
||||
#include "common/utils/ConfigBase.h"
|
||||
#include "common/utils/LockManager.h"
|
||||
#include "common/utils/Path.h"
|
||||
#include "storage/aio/BatchReadJob.h"
|
||||
#include "storage/store/ChunkEngine.h"
|
||||
#include "storage/store/ChunkStore.h"
|
||||
#include "storage/store/PhysicalConfig.h"
|
||||
#include "storage/update/UpdateJob.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
// One storage target: a directory on a disk holding chunks of a single chain.
//
// A target runs on one of two backends, selected by
// PhysicalConfig::only_chunk_engine: the chunk engine (`engine_`) or the
// file-based chunk store (`chunkStore_`). Most methods simply pick the backend.
class StorageTarget : public enable_shared_from_this<StorageTarget> {
 protected:
  // Protected so that instances are made through enable_shared_from_this::create().
  StorageTarget(const ChunkStore::Config &config,
                GlobalFileStore &globalFileStore,
                uint32_t diskIndex,
                chunk_engine::Engine *engine);

 public:
  using Config = ChunkStore::Config;

  ~StorageTarget();

  // Create a new storage target (or validate an existing one).
  Result<Void> create(const PhysicalConfig &config);

  // Load an existing storage target from `path`.
  Result<Void> load(const Path &path);

  // Register additional chunk sizes.
  Result<Void> addChunkSize(const std::vector<Size> &sizeList);

  // Target id. [guaranteed loaded]
  TargetId targetId() const { return TargetId{targetConfig_.target_id}; }

  // Chain id. [guaranteed loaded]
  ChainId chainId() const { return ChainId{targetConfig_.chain_id}; }

  // Persist the chain id.
  Result<Void> setChainId(ChainId chainId);

  // Index of the disk this target lives on.
  uint32_t diskIndex() const { return diskIndex_; }

  // Target directory. [guaranteed loaded]
  Path path() const { return targetConfig_.path; }

  // Metadata of all chunks, as a vector or as a map keyed by chunk id.
  Result<Void> getAllMetadata(ChunkMetaVector &metadataVec);
  Result<Void> getAllMetadataMap(std::unordered_map<ChunkId, ChunkMetadata> &metas);

  // Lock a chunk.
  auto lockChunk(folly::coro::Baton &baton, const ChunkId &chunk, const std::string &tag) {
    return chunkLocks_.lock(baton, chunk.data(), tag);
  }

  // Try to lock a channel.
  auto tryLockChannel(folly::coro::Baton &baton, const std::string &key) { return channelLocks_.tryLock(baton, key); }

  // Prepare / finish an aio read.
  Result<Void> aioPrepareRead(AioReadJob &job);
  Result<Void> aioFinishRead(AioReadJob &job);

  // Update a chunk (write/remove/truncate).
  void updateChunk(UpdateJob &job, folly::CPUThreadPoolExecutor &executor);

  // Query chunks: the chunk ids in result are in reverse lexicographical order.
  Result<std::vector<std::pair<ChunkId, ChunkMetadata>>> queryChunks(const ChunkIdRange &chunkIdRange);

  // Query a single chunk.
  Result<ChunkMetadata> queryChunk(const ChunkId &chunkId);

  // Recycle a batch of chunks; returns true if all holes are punched.
  // Always true in chunk-engine mode.
  Result<bool> punchHole() {
    if (!useChunkEngine()) {
      return chunkStore_.punchHole();
    }
    return true;
  }

  // Sync the meta kv. Nothing to do in chunk-engine mode.
  Result<Void> sync() {
    if (!useChunkEngine()) {
      return chunkStore_.sync();
    }
    return Void{};
  }

  // Publish the space gauges.
  Result<Void> reportUnrecycledSize();

  // Used size as reported by the active backend.
  uint64_t usedSize() const {
    if (!useChunkEngine()) {
      return chunkStore_.usedSize();
    }
    return ChunkEngine::chainUsedSize(*engine_, chainId());
  }

  // Unused size cached by the last successful reportUnrecycledSize().
  uint64_t unusedSize() const { return unusedSize_; }

  // Ids of all uncommitted chunks.
  Result<std::vector<ChunkId>> uncommitted() {
    if (!useChunkEngine()) {
      return chunkStore_.uncommitted();
    }
    return ChunkEngine::queryUncommittedChunks(*engine_, chainId());
  }

  // Reset uncommitted chunks to committed state.
  Result<Void> resetUncommitted(ChainVer chainVer) {
    if (!useChunkEngine()) {
      return chunkStore_.resetUncommitted(chainVer);
    }
    return ChunkEngine::resetUncommittedChunks(*engine_, chainId(), chainVer);
  }

  // Enable or disable emergency recycling. Ignored in chunk-engine mode.
  void setEmergencyRecycling(bool enable) {
    if (!useChunkEngine()) {
      chunkStore_.setEmergencyRecycling(enable);
    }
  }

  // Record a successful read.
  void recordRealRead(uint32_t bytes, Duration latency) const;

  // Disk monitor tag.
  auto &tag() const { return diskTag_; }

  // Weak handle that expires when this target is destroyed.
  std::weak_ptr<bool> aliveWeakPtr() const { return alive_; }

  // Global serial number assigned at construction.
  auto generationId() const { return generationId_; }

  // Mark the target as released (the destructor will then skip its sync) and sync once now.
  Result<Void> release() {
    released_ = true;
    return sync();
  }

  // Whether the chunk engine backend is used.
  bool useChunkEngine() const { return targetConfig_.only_chunk_engine; }

 private:
  // NOTE: members are initialised in declaration order; keep this order in
  // sync with the constructor's initialiser list.
  const Config &config_;
  std::shared_ptr<bool> alive_ = std::make_shared<bool>();
  uint32_t diskIndex_;
  uint32_t generationId_;
  chunk_engine::Engine *engine_{};
  std::atomic<uint64_t> unusedSize_{};

  // Monitoring.
  monitor::TagSet diskTag_;
  monitor::TagSet targetTag_;
  monitor::Recorder::TagRef<monitor::CountRecorder> readCountPerDisk_;
  monitor::Recorder::TagRef<monitor::CountRecorder> readBytesPerDisk_;
  monitor::Recorder::TagRef<monitor::CountRecorder> readSuccBytesPerDisk_;
  monitor::Recorder::TagRef<monitor::LatencyRecorder> readSuccLatencyPerDisk_;
  monitor::Recorder::TagRef<monitor::ValueRecorder> targetUsedSize_;
  monitor::Recorder::TagRef<monitor::ValueRecorder> targetReservedSize_;
  monitor::Recorder::TagRef<monitor::ValueRecorder> targetUnrecycledSize_;

  // State.
  PhysicalConfig targetConfig_;
  ChunkStore chunkStore_;
  CoLockManager<> chunkLocks_;
  CoLockManager<> channelLocks_;
  folly::Synchronized<std::set<Size>, std::mutex> chunkSizeList_;
  bool released_ = false;
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
297
src/storage/store/StorageTargets.cc
Normal file
297
src/storage/store/StorageTargets.cc
Normal file
@@ -0,0 +1,297 @@
|
||||
#include "storage/store/StorageTargets.h"
|
||||
|
||||
#include <boost/filesystem/operations.hpp>
|
||||
#include <folly/experimental/coro/BlockingWait.h>
|
||||
#include <folly/experimental/coro/Collect.h>
|
||||
#include <folly/experimental/coro/Sleep.h>
|
||||
#include <folly/experimental/coro/Task.h>
|
||||
#include <memory>
|
||||
#include <sys/statvfs.h>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "chunk_engine/src/cxx.rs.h"
|
||||
#include "common/monitor/Sample.h"
|
||||
#include "common/utils/CPUExecutorGroup.h"
|
||||
#include "common/utils/Duration.h"
|
||||
#include "common/utils/LogCommands.h"
|
||||
#include "common/utils/Result.h"
|
||||
#include "common/utils/SysResource.h"
|
||||
#include "storage/service/Components.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
using namespace std::chrono_literals;
|
||||
|
||||
// Nothing to do beyond destroying the members.
StorageTargets::~StorageTargets() = default;
|
||||
|
||||
// Prepares the per-disk state for all configured target paths.
//
// For every path in config_.target_paths() this records the manufacturer of
// the underlying device (empty when the device is not in the scanned disk
// info), assigns a disk index in configuration order, and opens (or creates)
// a chunk engine under "<path>/engine". Engines are opened concurrently on
// `executor`; the call blocks until all of them are done.
//
// Fails when the disk scan fails, when a path cannot be stat'ed, or when any
// engine reports an error.
Result<Void> StorageTargets::init(CPUExecutorGroup &executor) {
  auto diskInfoResult = SysResource::scanDiskInfo();
  RETURN_AND_LOG_ON_ERROR(diskInfoResult);
  std::unordered_map<uint32_t, std::string> manufacturerByDevice;
  for (auto &disk : *diskInfoResult) {
    manufacturerByDevice[disk.deviceId] = disk.manufacturer;
  }

  targetPaths_ = config_.target_paths();
  for (auto &path : targetPaths_) {
    struct stat st;
    if (::stat(path.c_str(), &st) != 0) {
      auto msg = fmt::format("stat {} failed: {}", path, errno);
      XLOG(ERR, msg);
      return makeError(StorageCode::kStorageStatFailed, std::move(msg));
    }
    manufacturers_.push_back(manufacturerByDevice[st.st_dev]);
  }

  uint32_t nextDiskIndex = 0;
  for (auto &path : targetPaths_) {
    pathToDiskIndex_[path] = nextDiskIndex;
    ++nextDiskIndex;
  }

  // One engine per target path, opened in parallel.
  std::vector<folly::coro::TaskWithExecutor<Result<rust::Box<chunk_engine::Engine>>>> tasks;
  for (auto &path : targetPaths_) {
    auto enginePath = path / "engine";
    // Create the engine when its directory is missing or when creation is forced by config.
    const bool missing = !boost::filesystem::exists(enginePath);
    const bool forced = config_.create_engine_path();
    const bool shouldCreate = missing || forced;
    tasks.push_back(folly::coro::co_invoke([enginePath, shouldCreate]() -> CoTryTask<rust::Box<chunk_engine::Engine>> {
                      std::string error;
                      auto engine = chunk_engine::create(enginePath.c_str(), shouldCreate, sizeof(ChainId), error);
                      if (!error.empty()) {
                        co_return makeError(StorageCode::kStorageStatFailed, std::move(error));
                      }
                      co_return engine;
                    }).scheduleOn(&executor.pickNext()));
  }

  auto engineResults = folly::coro::blockingWait(folly::coro::collectAllRange(std::move(tasks)));
  for (auto &engineResult : engineResults) {
    RETURN_AND_LOG_ON_ERROR(engineResult);
    engines_.push_back(std::move(engineResult.value()));
  }

  return Void{};
}
|
||||
|
||||
// Creates all targets listed in `createConfig`.
//
// After init(), the target ids are spread over the configured paths in order,
// target_num_per_path ids per path; the number of ids must equal
// paths * target_num_per_path. Each target is created in
// "<path>/<target id>" and then registered in the target map. The first
// failure aborts the whole call.
Result<Void> StorageTargets::create(const CreateConfig &createConfig) {
  CPUExecutorGroup executor(1, "Creator");
  RETURN_AND_LOG_ON_ERROR(init(executor));

  auto targetPaths = config_.target_paths();
  auto targetNumPerPath = config_.target_num_per_path();
  auto targetIdSize = createConfig.target_ids().size();

  // Validate the layout before touching any disk.
  if (targetPaths.empty()) {
    auto msg = fmt::format("List of target path is empty");
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }
  if (targetNumPerPath == 0) {
    auto msg = fmt::format("Target num per path is 0!");
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }
  if (targetPaths.size() * targetNumPerPath != targetIdSize) {
    auto msg = fmt::format("Unable to arrange target. path size {}, target num per path {}, target id size {}",
                           targetPaths.size(),
                           targetNumPerPath,
                           targetIdSize);
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, msg);
  }

  size_t position = 0;
  for (auto &targetId : createConfig.target_ids()) {
    // Consecutive ids share a disk until targetNumPerPath of them are placed.
    auto diskIndex = position / targetNumPerPath;
    auto storageTarget = StorageTarget::enable_shared_from_this::create(config_.storage_target(),
                                                                        globalFileStore_,
                                                                        diskIndex,
                                                                        &*engines_[diskIndex]);

    PhysicalConfig physicalConfig;
    physicalConfig.path = targetPaths[diskIndex] / std::to_string(targetId);
    physicalConfig.target_id = targetId;
    physicalConfig.allow_disk_without_uuid = createConfig.allow_disk_without_uuid();
    physicalConfig.allow_existing_targets = createConfig.allow_existing_targets();
    physicalConfig.physical_file_count = createConfig.physical_file_count();
    physicalConfig.chunk_size_list = createConfig.chunk_size_list();
    physicalConfig.only_chunk_engine = createConfig.only_chunk_engine();

    RETURN_AND_LOG_ON_ERROR(storageTarget->create(physicalConfig));
    RETURN_AND_LOG_ON_ERROR(targetMap_.addStorageTarget(storageTarget));
    ++position;
  }
  return Void{};
}
|
||||
|
||||
// Create a single storage target described by `req`, or extend / accept an existing one.
//
// The request is rejected with kStorageInitFailed when the disk index is out of range, when the chain id
// is the default value, when another target already owns the chain, when `addChunkSize` is set but the
// chain has no target, or when the target already exists and `allowExistingTarget` is not set. The work
// is serialised per chain id through targetLocks_.
Result<Void> StorageTargets::create(const CreateTargetReq &req) {
  const auto &diskPaths = config_.target_paths();
  if (req.diskIndex >= diskPaths.size()) {
    auto reason = fmt::format("disk index exceed {} >= {}", req.diskIndex, diskPaths.size());
    XLOG(ERR, reason);
    return makeError(StorageCode::kStorageInitFailed, std::move(reason));
  }
  if (req.chainId == ChainId{}) {
    auto reason = fmt::format("target {} without chain id", req.targetId);
    XLOG(ERR, reason);
    return makeError(StorageCode::kStorageInitFailed, std::move(reason));
  }

  // Take the per-chain lock, blocking the calling thread if somebody else holds it.
  folly::coro::Baton baton;
  auto chainLock = targetLocks_.lock(baton, fmt::to_string(req.chainId));
  if (!chainLock.locked()) {
    folly::coro::blockingWait(chainLock.lock());
  }

  if (auto chainOwner = targetMap_.snapshot()->getByChainId(VersionedChainId{req.chainId, {}}, true)) {
    // The chain already has a target: it must be the requested one.
    auto ownerId = (*chainOwner)->targetId;
    if (ownerId != req.targetId) {
      auto reason = fmt::format("target {} is existing with same chain id {}, req target {}",
                                ownerId,
                                req.chainId,
                                req.targetId);
      XLOG(ERR, reason);
      return makeError(StorageCode::kStorageInitFailed, std::move(reason));
    }
    if (req.addChunkSize) {
      RETURN_AND_LOG_ON_ERROR((*chainOwner)->storageTarget->addChunkSize(req.chunkSizeList));
    }
  } else if (req.addChunkSize) {
    auto reason = fmt::format("target {} {} is not existing", req.chainId, req.targetId);
    XLOG(ERR, reason);
    return makeError(StorageCode::kStorageInitFailed, std::move(reason));
  }

  // Directory of the target: <disk path>/<target id>.
  auto targetPath = diskPaths[req.diskIndex] / std::to_string(req.targetId);

  if (targetMap_.snapshot()->getTarget(req.targetId)) {
    if (!req.allowExistingTarget) {
      auto reason = fmt::format("target {} is already existing", req.targetId);
      XLOG(ERR, reason);
      return makeError(StorageCode::kStorageInitFailed, std::move(reason));
    }
    // Known in memory: only succeed if the directory is really there.
    if (!boost::filesystem::exists(targetPath)) {
      auto reason = fmt::format("target {} is existing in memory, but not found in disk", req.targetId);
      XLOG(ERR, reason);
      return makeError(StorageCode::kStorageInitFailed, std::move(reason));
    }
    XLOGF(INFO, "target {} is already existing, return succ", req.targetId);
    return Void{};
  }

  auto storageTarget = StorageTarget::enable_shared_from_this::create(config_.storage_target(),
                                                                      globalFileStore_,
                                                                      req.diskIndex,
                                                                      &*engines_[req.diskIndex]);

  PhysicalConfig physical;
  physical.path = targetPath;
  physical.target_id = req.targetId;
  physical.chain_id = req.chainId;
  physical.allow_disk_without_uuid = config_.allow_disk_without_uuid();
  physical.allow_existing_targets = req.allowExistingTarget;
  physical.physical_file_count = req.physicalFileCount;
  physical.chunk_size_list = req.chunkSizeList;
  physical.kv_store_type = config_.storage_target().kv_store().type();
  physical.only_chunk_engine = req.onlyChunkEngine;

  RETURN_AND_LOG_ON_ERROR(storageTarget->create(physical));
  XLOGF(INFO, "Create storage target {} at {}", storageTarget->targetId(), targetPath.string());
  RETURN_AND_LOG_ON_ERROR(targetMap_.addStorageTarget(storageTarget));
  return Void{};
}
|
||||
|
||||
// Load every storage target found below the configured target paths.
//
// After init(), each direct child directory of a target path that contains a regular file named
// kPhysicalConfigFileName is handed to loadTarget() as a task scheduled on `executor`. The call blocks
// until all tasks finish and returns the first error among their results. When collect_all_fds is
// enabled the file descriptors of the global file store are collected into fds_.
Result<Void> StorageTargets::load(CPUExecutorGroup &executor) {
  RETURN_AND_LOG_ON_ERROR(init(executor));

  std::vector<folly::coro::TaskWithExecutor<Result<Void>>> loaders;
  for (auto &diskPath : config_.target_paths()) {
    // NOTE(review): the message says the path is skipped, but the directory is still scanned below.
    if (!CheckWorker::checkWritable(diskPath)) {
      XLOGF(DFATAL, "path {} isn't writable, skip it", diskPath);
    }
    for (auto &entry : boost::filesystem::directory_iterator(diskPath)) {
      auto configFile = entry / kPhysicalConfigFileName;
      if (!boost::filesystem::is_directory(entry) || !boost::filesystem::is_regular_file(configFile)) {
        continue;  // not a target directory.
      }
      loaders.push_back(folly::coro::co_invoke([this, entry]() -> CoTryTask<Void> {
                          co_return loadTarget(entry);
                        }).scheduleOn(&executor.pickNext()));
    }
  }

  auto outcomes = folly::coro::blockingWait(folly::coro::collectAllRange(std::move(loaders)));
  for (auto &outcome : outcomes) {
    RETURN_AND_LOG_ON_ERROR(outcome);
  }

  if (config_.collect_all_fds()) {
    globalFileStore_.collect(fds_);
  }
  return Void{};
}
|
||||
|
||||
// load a target.
|
||||
Result<Void> StorageTargets::loadTarget(const Path &targetPath) {
|
||||
auto diskPath = targetPath.parent_path();
|
||||
if (UNLIKELY(!pathToDiskIndex_.contains(diskPath))) {
|
||||
auto msg = fmt::format("Target path ({}) not belongs to any of disk paths", targetPath);
|
||||
XLOG(ERR, msg);
|
||||
return makeError(StorageCode::kStorageInitFailed, std::move(msg));
|
||||
}
|
||||
|
||||
auto diskIndex = pathToDiskIndex_[diskPath];
|
||||
auto storageTarget = StorageTarget::enable_shared_from_this::create(config_.storage_target(),
|
||||
globalFileStore_,
|
||||
diskIndex,
|
||||
&*engines_[diskIndex]);
|
||||
RETURN_AND_LOG_ON_ERROR(storageTarget->load(targetPath));
|
||||
XLOGF(INFO, "Load storage target {} at {}", storageTarget->targetId(), targetPath.string());
|
||||
auto targetId = storageTarget->targetId();
|
||||
if (UNLIKELY(targetPath.filename().string() != fmt::format("{}", targetId.toUnderType()))) {
|
||||
auto msg = fmt::format("Target id {} and path {} mismatch!", targetId, targetPath);
|
||||
XLOG(ERR, msg);
|
||||
return makeError(StorageCode::kStorageInitFailed, std::move(msg));
|
||||
}
|
||||
RETURN_AND_LOG_ON_ERROR(targetMap_.addStorageTarget(storageTarget));
|
||||
return Void{};
|
||||
}
|
||||
|
||||
// Report capacity and free space for every disk path, together with the ids of the targets on it.
//
// Calls are serialised through the "spaceInfos" lock. The previous answer is returned while it is
// younger than space_info_cache_timeout, unless `force` is set. `free` is the file system's free space
// plus the unused size reported by the targets on that disk plus the chunk engine's reserved size.
// Fails with kChunkOpenFailed when the file system query for a disk path fails.
Result<std::vector<SpaceInfo>> StorageTargets::spaceInfos(bool force) {
  folly::coro::Baton baton;
  auto guard = targetLocks_.lock(baton, "spaceInfos");
  if (!guard.locked()) {
    folly::coro::blockingWait(guard.lock());
  }

  const bool cacheFresh = RelativeTime::now() - spaceInfoUpdatedTime_ < config_.space_info_cache_timeout();
  if (cacheFresh && !force) {
    return cachedSpaceInfos_;
  }

  // Group the targets of the current snapshot by the disk path they live on.
  std::unordered_map<std::string, uint64_t> unusedByDisk;
  std::unordered_map<std::string, std::vector<hf3fs::flat::TargetId>> idsByDisk;
  auto snapshot = targetMap_.snapshot();
  for (auto &[targetId, target] : snapshot->getTargets()) {
    auto diskPath = target.path.parent_path().string();
    idsByDisk[diskPath].emplace_back(targetId);
    if (target.storageTarget != nullptr) {
      unusedByDisk[diskPath] += target.storageTarget->unusedSize();
    }
  }

  std::vector<SpaceInfo> infos;
  for (auto &[path, index] : pathToDiskIndex_) {
    boost::system::error_code ec{};
    auto fsSpace = boost::filesystem::space(path, ec);
    if (UNLIKELY(ec.failed())) {
      auto msg = fmt::format("get space info of directory {} failed: {}", path, ec.message());
      XLOG(ERR, msg);
      return makeError(StorageCode::kChunkOpenFailed, std::move(msg));
    }

    auto engineUsage = engines_[index]->raw_used_size();

    SpaceInfo info;
    info.path = targetPaths_[index].string();
    info.targetIds = idsByDisk[info.path];
    info.capacity = fsSpace.capacity;
    info.free = fsSpace.free + unusedByDisk[info.path] + engineUsage.reserved_size;
    info.available = fsSpace.available;
    info.manufacturer = manufacturers_[index];
    infos.push_back(std::move(info));
  }

  // Remember the answer for the next callers.
  cachedSpaceInfos_ = infos;
  spaceInfoUpdatedTime_ = RelativeTime::now();
  return infos;
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
104
src/storage/store/StorageTargets.h
Normal file
104
src/storage/store/StorageTargets.h
Normal file
@@ -0,0 +1,104 @@
|
||||
#pragma once
|
||||
|
||||
#include <folly/executors/CPUThreadPoolExecutor.h>
|
||||
|
||||
#include "chunk_engine/src/cxx.rs.h"
|
||||
#include "common/utils/CPUExecutorGroup.h"
|
||||
#include "common/utils/CoLockManager.h"
|
||||
#include "common/utils/ConfigBase.h"
|
||||
#include "common/utils/RobinHood.h"
|
||||
#include "fbs/mgmtd/HeartbeatInfo.h"
|
||||
#include "fbs/storage/Common.h"
|
||||
#include "storage/service/TargetMap.h"
|
||||
#include "storage/store/StorageTarget.h"
|
||||
|
||||
namespace hf3fs::test {
|
||||
struct StorageTargetsHelper;
|
||||
}
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
// Owns the storage targets of one storage node: creates or loads them under the configured target
// paths, registers them in the shared target map and reports per-disk space information.
class StorageTargets {
|
||||
public:
|
||||
// Settings of the target collection.
//  - target_paths: disk paths holding the targets; must not be empty.
//  - target_num_per_path: number of targets placed on each path by the batch create().
//  - collect_all_fds: when set, load() collects the file descriptors of the global file store.
//  - space_info_cache_timeout: how long spaceInfos() may return its cached answer.
//  - allow_disk_without_uuid: copied into the physical config of targets created on request.
//  - create_engine_path: OR-ed into the `create` flag given to chunk_engine::create() by init().
//  - storage_target: settings passed to every StorageTarget.
class Config : public ConfigBase<Config> {
|
||||
CONFIG_ITEM(target_paths, std::vector<Path>{}, [](auto &vec) { return !vec.empty(); });
|
||||
CONFIG_ITEM(target_num_per_path, 0u);
|
||||
CONFIG_HOT_UPDATED_ITEM(collect_all_fds, true);
|
||||
CONFIG_HOT_UPDATED_ITEM(space_info_cache_timeout, 5_s);
|
||||
CONFIG_HOT_UPDATED_ITEM(allow_disk_without_uuid, false);
|
||||
CONFIG_HOT_UPDATED_ITEM(create_engine_path, true);
|
||||
CONFIG_OBJ(storage_target, StorageTarget::Config);
|
||||
};
|
||||
|
||||
// Settings of the batch create(): the target ids to create plus the values copied into the
// physical config of each new target.
class CreateConfig : public ConfigBase<CreateConfig> {
|
||||
CONFIG_ITEM(target_ids, std::vector<flat::TargetId::UnderlyingType>{});
|
||||
CONFIG_ITEM(physical_file_count, 256u);
|
||||
CONFIG_ITEM(allow_disk_without_uuid, false);
|
||||
CONFIG_ITEM(allow_existing_targets, false);
|
||||
CONFIG_ITEM(chunk_size_list, (std::vector<Size>{512_KB, 1_MB, 2_MB, 4_MB, 16_MB, 64_MB}));
|
||||
CONFIG_ITEM(only_chunk_engine, false);
|
||||
};
|
||||
|
||||
// Both arguments are stored as references, so they must outlive this object.
StorageTargets(const Config &config, AtomicallyTargetMap &targetMap)
|
||||
: config_(config),
|
||||
targetMap_(targetMap) {}
|
||||
~StorageTargets();
|
||||
|
||||
// Prepare the chunk engines (one per target path); called by the batch create() and by load().
Result<Void> init(CPUExecutorGroup &executor);
|
||||
|
||||
// Create all targets listed in `createConfig`, target_num_per_path of them on each target path.
|
||||
Result<Void> create(const CreateConfig &createConfig);
|
||||
|
||||
// Create the single target described by `req` (or accept / extend an existing one).
|
||||
Result<Void> create(const CreateTargetReq &req);
|
||||
|
||||
// Load every target directory found below the target paths, running loadTarget() on `executor`.
|
||||
Result<Void> load(CPUExecutorGroup &executor);
|
||||
|
||||
// Load the target stored in directory `targetPath` and add it to the target map.
|
||||
Result<Void> loadTarget(const Path &targetPath);
|
||||
|
||||
// File descriptors gathered by load() when collect_all_fds is enabled.
|
||||
auto &fds() const { return fds_; }
|
||||
|
||||
// Per-disk space information; a cached answer is used unless `force` is set or the cache expired.
|
||||
Result<std::vector<SpaceInfo>> spaceInfos(bool force);
|
||||
|
||||
// Target paths, indexed by disk index.
|
||||
auto &targetPaths() const { return targetPaths_; }
|
||||
|
||||
// Manufacturer string of each disk, indexed by disk index.
|
||||
auto &manufacturers() const { return manufacturers_; }
|
||||
|
||||
// File store shared by all targets of this node.
|
||||
auto &globalFileStore() { return globalFileStore_; }
|
||||
|
||||
// Chunk engines, indexed by disk index.
|
||||
auto &engines() const { return engines_; }
|
||||
|
||||
// Remove all chunks of chain `chainId` from the chunk engine of disk `diskIndex`.
// NOTE(review): diskIndex is used to index engines_ without a bounds check - callers must pass a valid index.
|
||||
Result<Void> removeChunkEngineTarget(ChainId chainId, uint32_t diskIndex) {
|
||||
auto &engine = *engines_[diskIndex];
|
||||
return ChunkEngine::removeAllChunks(engine, chainId);
|
||||
}
|
||||
|
||||
private:
|
||||
friend struct test::StorageTargetsHelper;
|
||||
ConstructLog<"storage::StorageTargets"> constructLog_;
|
||||
const Config &config_;
|
||||
AtomicallyTargetMap &targetMap_;
|
||||
GlobalFileStore globalFileStore_;
|
||||
|
||||
// Per-disk data; targetPaths_, manufacturers_ and engines_ are indexed by the disk index that
// pathToDiskIndex_ maps a disk path to.
std::vector<Path> targetPaths_;
|
||||
std::vector<std::string> manufacturers_;
|
||||
std::map<Path, uint32_t> pathToDiskIndex_;
|
||||
std::vector<rust::Box<chunk_engine::Engine>> engines_;
|
||||
|
||||
// Named locks: one per chain id for create(req), plus "spaceInfos" guarding the cache below.
CoLockManager<> targetLocks_;
|
||||
RelativeTime spaceInfoUpdatedTime_;
|
||||
std::vector<SpaceInfo> cachedSpaceInfos_;
|
||||
|
||||
std::vector<int> fds_;
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
460
src/storage/sync/ResyncWorker.cc
Normal file
460
src/storage/sync/ResyncWorker.cc
Normal file
@@ -0,0 +1,460 @@
|
||||
#include "storage/sync/ResyncWorker.h"
|
||||
|
||||
#include <folly/ScopeGuard.h>
|
||||
#include <folly/experimental/coro/Collect.h>
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/utils/Duration.h"
|
||||
#include "common/utils/Result.h"
|
||||
#include "fbs/storage/Common.h"
|
||||
#include "storage/service/Components.h"
|
||||
#include "storage/update/UpdateJob.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
// File-local monitoring recorders used by ResyncWorker. The "storage.resync" recorder covers a whole
// sync round of a chain; the "storage.syncing.*" recorders count the individual decisions and
// forwarded operations made during a round.
namespace {
|
||||
|
||||
monitor::OperationRecorder resyncRecorder{"storage.resync"};
|
||||
monitor::CountRecorder resyncRoutingVersionMismatch{"storage.resync.routing_version_mismatch"};
|
||||
// Forwarded WRITE / REMOVE operations.
monitor::OperationRecorder syncingWriteRecorder{"storage.syncing.write_count"};
|
||||
monitor::OperationRecorder syncingRemoveRecorder{"storage.syncing.remove_count"};
|
||||
// Outcome of comparing a remote chunk meta with the local one.
monitor::CountRecorder syncingSkipCount{"storage.syncing.skip_count"};
|
||||
monitor::CountRecorder syncingRemoteMissCount{"storage.syncing.remote_miss_count"};
|
||||
monitor::CountRecorder syncingRemoteChainVersionLowCount{"storage.syncing.chain_version_low"};
|
||||
monitor::CountRecorder syncingRemoteChainVersionHighCount{"storage.syncing.chain_version_high"};
|
||||
monitor::CountRecorder syncingLocalUncommittedCount{"storage.syncing.local_uncommitted"};
|
||||
monitor::CountRecorder syncingCommitVersionMismatchCount{"storage.syncing.commit_version_mismatch"};
|
||||
monitor::CountRecorder syncingCurrentChainIsWritingCount{"storage.syncing.current_chain_is_writing"};
|
||||
monitor::CountRecorder syncingRemoteUncommittedCount{"storage.syncing.remote_uncommitted"};
|
||||
monitor::CountRecorder syncingRemoteFullSyncLightCount{"storage.syncing.full_sync_light"};
|
||||
monitor::CountRecorder syncingRemoteFullSyncHeavyCount{"storage.syncing.full_sync_heavy"};
|
||||
monitor::CountRecorder syncingLocalChunkInRecycleState{"storage.syncing.chunk_in_recycle_state"};
|
||||
// Operations dropped in forward() because the chunk changed after the comparison.
monitor::CountRecorder syncingSkipRemoveAfterUpdate{"storage.syncing.skip_remove_after_update"};
|
||||
monitor::CountRecorder syncingSkipUpdateAfterRemove{"storage.syncing.skip_update_after_remove"};
|
||||
// Progress gauges: chains still syncing, and chunks left in the current round.
monitor::ValueRecorder syncingRemainingTargetsCount{"storage.syncing.remaining_targets_count", std::nullopt, false};
|
||||
monitor::ValueRecorder syncingRemainingChunksCount{"storage.syncing.remaining_chunks_count", std::nullopt, false};
|
||||
|
||||
} // namespace
|
||||
|
||||
// Build the worker from its configuration; nothing is started here (see start()).
// `config` and `components` are stored in config_ / components_. The executor is given the pair
// (num_threads, num_threads) and a thread factory named "Sync"; pool_ is handed a pointer to that executor.
ResyncWorker::ResyncWorker(const Config &config, Components &components)
|
||||
: config_(config),
|
||||
components_(components),
|
||||
executors_(std::make_pair(config_.num_threads(), config_.num_threads()),
|
||||
std::make_shared<folly::NamedThreadFactory>("Sync")),
|
||||
pool_(config_.pool(), &executors_),
|
||||
updateChannelAllocator_(config_.num_channels()),
|
||||
batchConcurrencyLimiter_(config_.batch_concurrency_limiter()) {}
|
||||
|
||||
// Start the work pool, whose handler runs handleSync() for every queued chain, then launch the
// scheduling loop on the executor. started_ is only set once both are in place.
Result<Void> ResyncWorker::start() {
  auto startResult =
      pool_.start([this](VersionedChainId vChainId) -> CoTask<void> { co_await handleSync(vChainId); });
  RETURN_AND_LOG_ON_ERROR(startResult);

  executors_.add([this] { loop(); });
  started_ = true;
  return Void{};
}
|
||||
|
||||
// Ask the scheduling loop to stop, wait until it has confirmed (polling every 100 ms and logging
// every fifth round), then stop the work pool. The wait is skipped when start() never completed.
Result<Void> ResyncWorker::stopAndJoin() {
  stopping_ = true;
  cond_.notify_one();

  int round = 0;
  while (started_ && !stopped_) {
    XLOGF_IF(INFO, round % 5 == 0, "Waiting for ResyncWorker@{}::loop stop...", fmt::ptr(this));
    std::this_thread::sleep_for(100_ms);
    ++round;
  }

  pool_.stopAndJoin();
  return Void{};
}
|
||||
|
||||
void ResyncWorker::loop() {
|
||||
while (!stopping_) {
|
||||
auto lock = std::unique_lock(mutex_);
|
||||
if (cond_.wait_for(lock, 500_ms, [&] { return stopping_.load(); })) {
|
||||
break;
|
||||
}
|
||||
|
||||
auto syncingChains = components_.targetMap.snapshot()->syncingChains();
|
||||
syncingRemainingTargetsCount.set(syncingChains.size());
|
||||
std::shuffle(syncingChains.begin(), syncingChains.end(), std::mt19937{std::random_device{}()});
|
||||
for (auto &vChainId : syncingChains) {
|
||||
if (stopping_) {
|
||||
break;
|
||||
}
|
||||
bool succ = shards_.withLock(
|
||||
[vChainId](SyncingChainIds &syncingChainIds) {
|
||||
auto &status = syncingChainIds[vChainId.chainId];
|
||||
if (!status.isSyncing && RelativeTime::now() - status.lastSyncingTime > 30_s) {
|
||||
status.isSyncing = true;
|
||||
return true;
|
||||
} else {
|
||||
XLOGF(DBG, "chain id {} is syncing", vChainId.chainId);
|
||||
return false;
|
||||
}
|
||||
},
|
||||
vChainId.chainId);
|
||||
if (succ) {
|
||||
pool_.enqueueSync(vChainId);
|
||||
}
|
||||
}
|
||||
}
|
||||
stopped_ = true;
|
||||
XLOGF(INFO, "ResyncWorker@{}::loop stopped", fmt::ptr(this));
|
||||
}
|
||||
|
||||
// Run one resync round for chain `vChainId`.
//
// Steps: look up the local target of the chain, send SyncStart to the successor to obtain its chunk
// metas, compare them with the local metadata, forward REMOVE for chunks that exist only remotely and
// WRITE for chunks the successor lacks or holds in an older/uncommitted form, then send SyncDone.
// Operations are forwarded in batches of config_.batch_size(), and the routing of the chain is
// re-checked before every batch. Whatever the outcome, the chain's isSyncing flag is cleared and
// lastSyncingTime is updated on exit.
CoTryTask<void> ResyncWorker::handleSync(VersionedChainId vChainId) {
|
||||
// A full sync applies to this chain when the level is not NONE and the chain list is empty or names it.
auto fullSyncLevel = config_.full_sync_level();
|
||||
auto needFullSync = fullSyncLevel != FullSyncLevel::NONE &&
|
||||
(config_.full_sync_chains().empty() || config_.full_sync_chains().contains(vChainId.chainId));
|
||||
bool heavyFullSync = needFullSync && fullSyncLevel == FullSyncLevel::HEAVY;
|
||||
|
||||
// 1. Cancel the syncing state on exit.
|
||||
auto guard = folly::makeGuard([&] {
|
||||
shards_.withLock(
|
||||
[&](SyncingChainIds &syncingChainIds) {
|
||||
XLOGF(DBG9, "sync exit chain {}", vChainId);
|
||||
auto &status = syncingChainIds[vChainId.chainId];
|
||||
status.isSyncing = false;
|
||||
status.lastSyncingTime = RelativeTime::now();
|
||||
},
|
||||
vChainId.chainId);
|
||||
});
|
||||
XLOGF(DBG9, "start sync chain {}", vChainId);
|
||||
|
||||
// 2. find target and routing.
|
||||
auto targetResult = components_.targetMap.getByChainId(vChainId);
|
||||
if (UNLIKELY(!targetResult)) {
|
||||
auto msg = fmt::format("sync start {} get routing failed: {}", vChainId, targetResult.error());
|
||||
XLOG(ERR, msg);
|
||||
co_return makeError(StorageCode::kSyncSendStartFailed, std::move(msg));
|
||||
}
|
||||
auto target = std::move(*targetResult);
|
||||
auto targetId = target->targetId;
|
||||
|
||||
// The client id used for forwarded requests is the versioned chain id followed by the target id,
// packed into the uuid bytes.
// NOTE(review): the stores go through reinterpret_cast on uuid.data; confirm alignment and aliasing
// are acceptable for these types.
ClientId clientId{};
|
||||
static_assert(sizeof(ClientId::uuid) == sizeof(VersionedChainId) + sizeof(TargetId));
|
||||
*reinterpret_cast<VersionedChainId *>(clientId.uuid.data) = vChainId;
|
||||
*reinterpret_cast<TargetId *>(clientId.uuid.data + sizeof(VersionedChainId)) = targetId;
|
||||
|
||||
// Per-round counters; they are published with the "instance" tag once the comparison is finished.
monitor::TagSet tag;
|
||||
tag.addTag("instance", fmt::format("{}-{}", targetId, vChainId.chainVer));
|
||||
uint32_t currentSyncingRemoteMissCount = 0;
|
||||
uint32_t currentSyncingRemoteChainVersionLowCount = 0;
|
||||
uint32_t currentSyncingRemoteChainVersionHighCount = 0;
|
||||
uint32_t currentSyncingRemoteUncommittedCount = 0;
|
||||
uint32_t currentSyncingLocalUncommittedCount = 0;
|
||||
uint32_t currentSyncingCommitVersionMismatchCount = 0;
|
||||
uint32_t currentSyncingCurrentChainIsWritingCount = 0;
|
||||
uint32_t currentSyncingRemoteFullSyncHeavyCount = 0;
|
||||
uint32_t currentSyncingRemoteFullSyncLightCount = 0;
|
||||
uint32_t currentSyncingSkipCount = 0;
|
||||
auto recordGuard = resyncRecorder.record(tag);
|
||||
|
||||
// The remaining-chunks gauge is reset to 0 however the round ends.
auto remainingChunksCount = syncingRemainingChunksCount.getRecoderWithTag(tag);
|
||||
SCOPE_EXIT { remainingChunksCount->set(0); };
|
||||
|
||||
// 3. sync start.
|
||||
net::UserRequestOptions options;
|
||||
options.timeout = config_.sync_start_timeout();
|
||||
std::vector<ChunkMeta> remoteMetas;
|
||||
|
||||
auto addrResult = target->getSuccessorAddr();
|
||||
if (UNLIKELY(!addrResult)) {
|
||||
XLOGF(ERR, "sync start get successor addr error: {}", addrResult.error());
|
||||
co_return makeError(std::move(addrResult.error()));
|
||||
}
|
||||
{
|
||||
SyncStartReq syncStartReq;
|
||||
syncStartReq.vChainId = vChainId;
|
||||
|
||||
auto syncStartResult = co_await components_.messenger.syncStart(*addrResult, syncStartReq, &options);
|
||||
if (UNLIKELY(!syncStartResult)) {
|
||||
// A routing version mismatch is not recorded as a failed round (the record guard is dismissed)
// and is only logged at debug level; the original error is returned.
if (syncStartResult.error().code() == StorageClientCode::kRoutingVersionMismatch) {
|
||||
recordGuard.dismiss();
|
||||
resyncRoutingVersionMismatch.addSample(1);
|
||||
auto msg = fmt::format("sync start {} request failed: {}", vChainId, syncStartResult.error());
|
||||
XLOG(DBG9, msg);
|
||||
co_return makeError(std::move(syncStartResult.error()));
|
||||
}
|
||||
auto msg = fmt::format("sync start {} request failed: {}", vChainId, syncStartResult.error());
|
||||
XLOG(ERR, msg);
|
||||
co_return makeError(StorageCode::kSyncSendStartFailed, std::move(msg));
|
||||
}
|
||||
|
||||
remoteMetas = std::move(syncStartResult->metas);
|
||||
}
|
||||
|
||||
// 4. compare the local chunk metadata with the remote metas.
|
||||
std::unordered_map<ChunkId, ChunkMetadata> localMetas;
|
||||
auto result = target->storageTarget->getAllMetadataMap(localMetas);
|
||||
if (UNLIKELY(!result)) {
|
||||
XLOGF(ERR, "target invalid iterator {}, error {}", targetId, result.error());
|
||||
co_return makeError(std::move(result.error()));
|
||||
}
|
||||
// re-check current chain version.
|
||||
{
|
||||
auto targetResult = components_.targetMap.getByChainId(vChainId);
|
||||
if (UNLIKELY(!targetResult)) {
|
||||
auto msg = fmt::format("sync re-check {} get routing failed: {}", vChainId, targetResult.error());
|
||||
XLOG(ERR, msg);
|
||||
co_return makeError(StorageCode::kSyncSendStartFailed, std::move(msg));
|
||||
}
|
||||
}
|
||||
// writeList: chunk id and chunk size of every chunk to forward as WRITE.
// removeList: chunks the successor has but this target does not.
std::vector<std::pair<ChunkId, uint32_t>> writeList;
|
||||
std::vector<ChunkId> removeList;
|
||||
|
||||
bool hasFatalEvents = false;
|
||||
for (auto &remoteMeta : remoteMetas) {
|
||||
// 1. check exists.
|
||||
auto it = localMetas.find(remoteMeta.chunkId);
|
||||
if (it == localMetas.end()) {
|
||||
removeList.push_back(remoteMeta.chunkId);
|
||||
continue;
|
||||
}
|
||||
// Every matched entry is erased when the iteration ends, so the entries left in localMetas after
// the loop were not matched by any remote meta.
SCOPE_EXIT { localMetas.erase(it); };
|
||||
|
||||
// 2. check recycle state.
|
||||
const auto &chunkId = it->first;
|
||||
const auto &meta = it->second;
|
||||
if (UNLIKELY(meta.recycleState != RecycleState::NORMAL)) {
|
||||
XLOGF(WARNING, "target {} chunk {} in recycle state: {}", targetId, chunkId, meta);
|
||||
syncingLocalChunkInRecycleState.addSample(1);
|
||||
continue; // skip chunk in recycle state.
|
||||
}
|
||||
|
||||
// 3. decide whether the local chunk must be forwarded. The first matching rule wins:
//    - local chain version newer than remote: forward.
//    - remote not committed (updateVer != commitVer, or state is not COMMIT): forward.
//    - local chain version older than remote: fatal if the local chunk is committed, else skip.
//    - local updateVer != remote commitVer: fatal if the local chunk is committed under another
//      chain version, else skip (treated as a write in progress).
//    - heavy full sync: forward.
//    - checksum differs: fatal if the local chain version is not the current one, else skip.
//    - otherwise the chunk is identical: skip.
|
||||
bool needForward = true;
|
||||
if (meta.chainVer > remoteMeta.chainVer) {
|
||||
++currentSyncingRemoteChainVersionLowCount;
|
||||
} else if (remoteMeta.updateVer != remoteMeta.commitVer || remoteMeta.chunkState != ChunkState::COMMIT) {
|
||||
XLOGF(WARNING, "chain {} remote uncommitted {}", vChainId.chainId, remoteMeta);
|
||||
++currentSyncingRemoteUncommittedCount;
|
||||
} else if (meta.chainVer < remoteMeta.chainVer) {
|
||||
if (meta.chunkState == ChunkState::COMMIT) {
|
||||
++currentSyncingRemoteChainVersionHighCount;
|
||||
XLOGF(DFATAL, "chain {} remote chain version high, local {}, remote {}", vChainId, meta, remoteMeta);
|
||||
hasFatalEvents = true;
|
||||
break;
|
||||
} else {
|
||||
needForward = false;
|
||||
++currentSyncingLocalUncommittedCount;
|
||||
XLOGF(CRITICAL, "chain {} local uncommitted, local {}, remote {}", vChainId, meta, remoteMeta);
|
||||
}
|
||||
} else if (meta.updateVer != remoteMeta.commitVer) {
|
||||
if (meta.chainVer != vChainId.chainVer && meta.chunkState == ChunkState::COMMIT) {
|
||||
++currentSyncingCommitVersionMismatchCount;
|
||||
XLOGF(DFATAL, "chain {} commit version mismatch, local {}, remote {}", vChainId, meta, remoteMeta);
|
||||
hasFatalEvents = true;
|
||||
break;
|
||||
} else {
|
||||
needForward = false;
|
||||
++currentSyncingCurrentChainIsWritingCount;
|
||||
XLOGF(CRITICAL, "chain {} chain is writing, local {}, remote {}", vChainId, meta, remoteMeta);
|
||||
}
|
||||
} else if (heavyFullSync) {
|
||||
++currentSyncingRemoteFullSyncHeavyCount;
|
||||
} else if (meta.checksum() != remoteMeta.checksum) {
|
||||
if (meta.chainVer != vChainId.chainVer) {
|
||||
XLOGF(DFATAL, "chain {} checksum not equal, local {}, remote {}", vChainId, meta, remoteMeta);
|
||||
++currentSyncingRemoteFullSyncLightCount;
|
||||
hasFatalEvents = true;
|
||||
break;
|
||||
} else {
|
||||
needForward = false;
|
||||
++currentSyncingCurrentChainIsWritingCount;
|
||||
XLOGF(CRITICAL,
|
||||
"chain {} checksum not equal because of writing, local {}, remote {}",
|
||||
vChainId,
|
||||
meta,
|
||||
remoteMeta);
|
||||
}
|
||||
} else {
|
||||
needForward = false;
|
||||
}
|
||||
if (needForward) {
|
||||
writeList.emplace_back(chunkId, meta.innerFileId.chunkSize);
|
||||
} else {
|
||||
++currentSyncingSkipCount;
|
||||
}
|
||||
}
|
||||
|
||||
// A fatal inconsistency aborts the round: an OfflineTargetReq for this target id (force = true) is
// sent to the successor address, then the round fails.
if (UNLIKELY(hasFatalEvents)) {
|
||||
auto msg = fmt::format("sync {} has fatal events", vChainId);
|
||||
XLOG(CRITICAL, msg);
|
||||
|
||||
OfflineTargetReq req;
|
||||
req.targetId = targetId;
|
||||
req.force = true;
|
||||
CO_RETURN_AND_LOG_ON_ERROR(co_await components_.messenger.offlineTarget(*addrResult, req, &options));
|
||||
|
||||
co_return makeError(StorageCode::kSyncSendStartFailed, std::move(msg));
|
||||
}
|
||||
|
||||
// Local chunks that no remote meta matched are missing on the successor and must be written.
for (auto &[chunkId, meta] : localMetas) {
|
||||
writeList.emplace_back(chunkId, meta.innerFileId.chunkSize);
|
||||
++currentSyncingRemoteMissCount;
|
||||
}
|
||||
|
||||
syncingRemoteMissCount.addSample(currentSyncingRemoteMissCount, tag);
|
||||
syncingRemoteChainVersionLowCount.addSample(currentSyncingRemoteChainVersionLowCount, tag);
|
||||
syncingRemoteChainVersionHighCount.addSample(currentSyncingRemoteChainVersionHighCount, tag);
|
||||
syncingLocalUncommittedCount.addSample(currentSyncingLocalUncommittedCount, tag);
|
||||
syncingRemoteUncommittedCount.addSample(currentSyncingRemoteUncommittedCount, tag);
|
||||
syncingCommitVersionMismatchCount.addSample(currentSyncingCommitVersionMismatchCount, tag);
|
||||
syncingCurrentChainIsWritingCount.addSample(currentSyncingCurrentChainIsWritingCount, tag);
|
||||
syncingRemoteFullSyncHeavyCount.addSample(currentSyncingRemoteFullSyncHeavyCount, tag);
|
||||
syncingRemoteFullSyncLightCount.addSample(currentSyncingRemoteFullSyncLightCount, tag);
|
||||
syncingSkipCount.addSample(currentSyncingSkipCount, tag);
|
||||
|
||||
// 5. forward the removes first, then the writes, batch by batch. The target is looked up again
// before every batch, and the first failed operation of a batch ends the round.
// NOTE(review): the value returned by batchConcurrencyLimiter_.lock(0) is stored without co_await;
// confirm that lock() acquires synchronously.
auto batchSize = config_.batch_size();
|
||||
auto remainingCount = writeList.size() + removeList.size();
|
||||
remainingChunksCount->set(remainingCount);
|
||||
for (auto batchStart = 0ul; batchStart < removeList.size(); batchStart += batchSize) {
|
||||
auto targetResult = components_.targetMap.getByChainId(vChainId);
|
||||
if (UNLIKELY(!targetResult)) {
|
||||
auto msg = fmt::format("sync re-check {} get routing failed: {}", vChainId, targetResult.error());
|
||||
XLOG(ERR, msg);
|
||||
co_return makeError(StorageCode::kSyncSendStartFailed, std::move(msg));
|
||||
}
|
||||
target = std::move(*targetResult);
|
||||
std::vector<CoTryTask<void>> batch;
|
||||
for (auto idx = batchStart; idx < removeList.size() && idx < batchStart + batchSize; ++idx) {
|
||||
batch.push_back(forward(target, tag, clientId, std::move(removeList[idx]), UpdateType::REMOVE, 0));
|
||||
}
|
||||
auto guard = batchConcurrencyLimiter_.lock(0);
|
||||
auto results = co_await folly::coro::collectAllRange(std::move(batch));
|
||||
for (auto &result : results) {
|
||||
if (UNLIKELY(!result)) {
|
||||
XLOGF(ERR, "target {} forward remove failed {}", targetId, result.error());
|
||||
CO_RETURN_ERROR(result);
|
||||
}
|
||||
}
|
||||
remainingCount -= results.size();
|
||||
remainingChunksCount->set(remainingCount);
|
||||
}
|
||||
for (auto batchStart = 0ul; batchStart < writeList.size(); batchStart += batchSize) {
|
||||
auto targetResult = components_.targetMap.getByChainId(vChainId);
|
||||
if (UNLIKELY(!targetResult)) {
|
||||
auto msg = fmt::format("sync re-check {} get routing failed: {}", vChainId, targetResult.error());
|
||||
XLOG(ERR, msg);
|
||||
co_return makeError(StorageCode::kSyncSendStartFailed, std::move(msg));
|
||||
}
|
||||
target = std::move(*targetResult);
|
||||
std::vector<CoTryTask<void>> batch;
|
||||
for (auto idx = batchStart; idx < writeList.size() && idx < batchStart + batchSize; ++idx) {
|
||||
auto &[chunkId, chunkSize] = writeList[idx];
|
||||
batch.push_back(forward(target, tag, clientId, std::move(chunkId), UpdateType::WRITE, chunkSize));
|
||||
}
|
||||
auto guard = batchConcurrencyLimiter_.lock(0);
|
||||
auto results = co_await folly::coro::collectAllRange(std::move(batch));
|
||||
for (auto &result : results) {
|
||||
if (UNLIKELY(!result)) {
|
||||
XLOGF(ERR, "target {} forward write failed {}", targetId, result.error());
|
||||
CO_RETURN_ERROR(result);
|
||||
}
|
||||
}
|
||||
remainingCount -= results.size();
|
||||
remainingChunksCount->set(remainingCount);
|
||||
}
|
||||
|
||||
// 6. sync done.
// NOTE(review): the address failure below is logged with the "sync start" wording although it
// belongs to the sync-done step.
|
||||
{
|
||||
SyncDoneReq syncDoneReq;
|
||||
syncDoneReq.vChainId = vChainId;
|
||||
|
||||
auto addrResult = target->getSuccessorAddr();
|
||||
if (UNLIKELY(!addrResult)) {
|
||||
XLOGF(ERR, "sync start get successor addr error: {}", addrResult.error());
|
||||
co_return makeError(std::move(addrResult.error()));
|
||||
}
|
||||
auto syncDoneResult = co_await components_.messenger.syncDone(*addrResult, syncDoneReq);
|
||||
if (UNLIKELY(!syncDoneResult)) {
|
||||
auto msg = fmt::format("sync done {} request failed: {}", vChainId, syncDoneResult.error());
|
||||
XLOG(ERR, msg);
|
||||
co_return makeError(StorageCode::kSyncSendDoneFailed, std::move(msg));
|
||||
}
|
||||
if (UNLIKELY(!syncDoneResult->result.lengthInfo)) {
|
||||
auto msg = fmt::format("sync done {} request failed: {}", vChainId, syncDoneResult->result.lengthInfo.error());
|
||||
XLOG(ERR, msg);
|
||||
co_return makeError(StorageCode::kSyncSendDoneFailed, std::move(msg));
|
||||
}
|
||||
}
|
||||
|
||||
recordGuard.succ();
|
||||
XLOGF(INFO,
|
||||
"sync done chain {} target {} update {} remove {}",
|
||||
vChainId,
|
||||
targetId,
|
||||
writeList.size(),
|
||||
removeList.size());
|
||||
co_return Void{};
|
||||
}
|
||||
|
||||
// Forward one resync operation (WRITE or REMOVE of `chunkId`) for `target` to the next node.
//
// The chunk is locked with tag "sync" and its current state is queried again, because it may have
// changed since the metadata comparison: a REMOVE is dropped when the chunk exists in NORMAL recycle
// state, a WRITE is dropped when the chunk metadata is no longer found, and for an existing chunk the
// latest chunk size replaces `chunkSize`. Both skips count as success. Otherwise an update request
// marked as syncing is sent through reliableForwarding on a freshly allocated update channel.
// Returns kResourceBusy when no channel is available, or the query / forwarding error.
CoTryTask<void> ResyncWorker::forward(const TargetPtr &target,
                                      const monitor::TagSet &tag,
                                      const ClientId &clientId,
                                      ChunkId chunkId,
                                      UpdateType updateType,
                                      uint32_t chunkSize) {
  auto recordGuard =
      updateType == UpdateType::REMOVE ? syncingRemoveRecorder.record(tag) : syncingWriteRecorder.record(tag);

  folly::coro::Baton baton;
  auto chunkLock = target->storageTarget->lockChunk(baton, chunkId, "sync");
  if (!chunkLock.locked()) {
    XLOGF(WARNING, "target {} chunk {} wait lock, current tag: {}", *target, chunkId, chunkLock.currentTag());
    co_await chunkLock.lock();
  }

  // Re-read the chunk under the lock and drop operations that are no longer valid.
  auto chunkResult = target->storageTarget->queryChunk(chunkId);
  if (!chunkResult) {
    if (chunkResult.error().code() != StorageCode::kChunkMetadataNotFound) {
      co_return makeError(std::move(chunkResult.error()));
    }
    // chunk does not exist.
    if (updateType == UpdateType::WRITE) {
      XLOGF(WARNING, "target {} chunk {} has been removed, skip updated", *target, chunkId);
      syncingSkipUpdateAfterRemove.addSample(1);
      recordGuard.succ();
      co_return Void{};
    }
  } else {
    // chunk exists.
    if (updateType == UpdateType::REMOVE && chunkResult->recycleState == RecycleState::NORMAL) {
      XLOGF(WARNING, "target {} chunk {} has been updated, skip remove", *target, chunkId);
      syncingSkipRemoveAfterUpdate.addSample(1);
      recordGuard.succ();
      co_return Void{};
    }
    chunkSize = chunkResult->innerFileId.chunkSize;  // use latest chunk size.
  }

  // The channel is handed back when this coroutine finishes.
  UpdateChannel channel;
  if (UNLIKELY(!updateChannelAllocator_.allocate(channel))) {
    XLOGF(ERR, "no channel to forward sync write");
    co_return makeError(StorageClientCode::kResourceBusy);
  }
  auto channelGuard = folly::makeGuard([&] { updateChannelAllocator_.release(channel); });

  UpdateReq req;
  // What to update.
  req.payload.updateType = updateType;
  req.payload.key.chunkId = chunkId;
  req.payload.key.vChainId = target->vChainId;
  req.payload.offset = 0;
  req.payload.chunkSize = chunkSize;
  req.payload.updateVer = ChunkVer{1};
  req.payload.checksum.type = ChecksumType::CRC32C;
  // Who sends it.
  req.tag.clientId = clientId;
  req.tag.requestId = RequestId{++requestId_};
  req.tag.channel = channel;
  // How it is handled.
  req.options.fromClient = false;
  req.options.isSyncing = true;
  req.options.commitChainVer = target->vChainId.chainVer;

  CommitIO commitIO;
  TargetPtr t = target;
  ServiceRequestContext requestCtx{"resync"};
  ChunkEngineUpdateJob chunkEngineJob;
  auto forwardResult =
      co_await components_.reliableForwarding.forwardWithRetry(requestCtx, req, {}, chunkEngineJob, t, commitIO, false);
  CO_RETURN_ON_ERROR(forwardResult.lengthInfo);

  recordGuard.succ();
  co_return Void{};
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
84
src/storage/sync/ResyncWorker.h
Normal file
84
src/storage/sync/ResyncWorker.h
Normal file
@@ -0,0 +1,84 @@
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <condition_variable>
|
||||
#include <folly/executors/CPUThreadPoolExecutor.h>
|
||||
#include <mutex>
|
||||
|
||||
#include "client/storage/UpdateChannelAllocator.h"
|
||||
#include "common/serde/Serde.h"
|
||||
#include "common/utils/ConcurrencyLimiter.h"
|
||||
#include "common/utils/ConfigBase.h"
|
||||
#include "common/utils/CoroutinesPool.h"
|
||||
#include "common/utils/Duration.h"
|
||||
#include "common/utils/Shards.h"
|
||||
#include "fbs/storage/Common.h"
|
||||
#include "storage/service/TargetMap.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
struct Components;
|
||||
|
||||
class ResyncWorker {
|
||||
public:
|
||||
enum FullSyncLevel {
|
||||
NONE,
|
||||
HEAVY, // sync all.
|
||||
};
|
||||
struct Config : ConfigBase<Config> {
|
||||
CONFIG_ITEM(num_threads, 16ul);
|
||||
CONFIG_ITEM(num_channels, 1024u);
|
||||
CONFIG_HOT_UPDATED_ITEM(batch_size, 16u);
|
||||
CONFIG_HOT_UPDATED_ITEM(sync_start_timeout, 10_s);
|
||||
CONFIG_HOT_UPDATED_ITEM(full_sync_chains, std::set<uint32_t>{}); // full sync all chains if it is empty.
|
||||
CONFIG_HOT_UPDATED_ITEM(full_sync_level, FullSyncLevel::NONE);
|
||||
CONFIG_OBJ(pool, CoroutinesPoolBase::Config);
|
||||
CONFIG_OBJ(batch_concurrency_limiter, ConcurrencyLimiterConfig, [](auto &c) { c.set_max_concurrency(64); });
|
||||
};
|
||||
ResyncWorker(const Config &config, Components &components);
|
||||
|
||||
// start resync worker.
|
||||
Result<Void> start();
|
||||
|
||||
// stop resync worker. End all sync tasks immediately.
|
||||
Result<Void> stopAndJoin();
|
||||
|
||||
protected:
|
||||
void loop();
|
||||
|
||||
// handle sync job.
|
||||
CoTryTask<void> handleSync(VersionedChainId vChainId);
|
||||
|
||||
// forward sync request.
|
||||
CoTryTask<void> forward(const TargetPtr &target,
|
||||
const monitor::TagSet &tag,
|
||||
const ClientId &clientId,
|
||||
ChunkId chunkId,
|
||||
UpdateType updateType,
|
||||
uint32_t chunkSize);
|
||||
|
||||
private:
|
||||
ConstructLog<"storage::ResyncWorker"> constructLog_;
|
||||
const Config &config_;
|
||||
Components &components_;
|
||||
folly::CPUThreadPoolExecutor executors_;
|
||||
CoroutinesPool<VersionedChainId> pool_;
|
||||
client::UpdateChannelAllocator updateChannelAllocator_;
|
||||
ConcurrencyLimiter<uint32_t> batchConcurrencyLimiter_;
|
||||
|
||||
std::mutex mutex_;
|
||||
std::condition_variable cond_;
|
||||
std::atomic<bool> stopping_ = false;
|
||||
std::atomic<bool> started_ = false;
|
||||
std::atomic<bool> stopped_ = false;
|
||||
|
||||
struct SyncingStatus {
|
||||
SERDE_STRUCT_FIELD(isSyncing, false);
|
||||
SERDE_STRUCT_FIELD(lastSyncingTime, RelativeTime{});
|
||||
};
|
||||
using SyncingChainIds = robin_hood::unordered_map<ChainId, SyncingStatus>;
|
||||
Shards<SyncingChainIds, 32> shards_;
|
||||
std::atomic_uint64_t requestId_;
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
118
src/storage/update/UpdateJob.h
Normal file
118
src/storage/update/UpdateJob.h
Normal file
@@ -0,0 +1,118 @@
|
||||
#pragma once
|
||||
|
||||
#include <folly/experimental/coro/Baton.h>
|
||||
|
||||
#include "chunk_engine/src/cxx.rs.h"
|
||||
#include "fbs/storage/Common.h"
|
||||
#include "storage/store/ChunkMetadata.h"
|
||||
#include "storage/store/ChunkStore.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
class StorageTarget;
|
||||
|
||||
class ChunkEngineUpdateJob {
|
||||
public:
|
||||
ChunkEngineUpdateJob() = default;
|
||||
ChunkEngineUpdateJob(const ChunkEngineUpdateJob &) = delete;
|
||||
ChunkEngineUpdateJob(ChunkEngineUpdateJob &&other)
|
||||
: engine_(std::exchange(other.engine_, nullptr)),
|
||||
chunk_(std::exchange(other.chunk_, nullptr)) {}
|
||||
|
||||
void set(chunk_engine::Engine &engine, chunk_engine::WritingChunk *chunk) {
|
||||
reset();
|
||||
engine_ = &engine;
|
||||
chunk_ = chunk;
|
||||
}
|
||||
|
||||
auto release() { return std::exchange(engine_, nullptr); }
|
||||
auto chunk() const { return chunk_; }
|
||||
|
||||
void reset() {
|
||||
if (engine_ && chunk_) {
|
||||
release()->release_writing_chunk(chunk_);
|
||||
}
|
||||
}
|
||||
|
||||
~ChunkEngineUpdateJob() { reset(); }
|
||||
|
||||
private:
|
||||
chunk_engine::Engine *engine_{};
|
||||
chunk_engine::WritingChunk *chunk_{};
|
||||
};
|
||||
|
||||
class UpdateJob {
|
||||
public:
|
||||
UpdateJob(ServiceRequestContext &requestCtx,
|
||||
const UpdateIO &updateIO,
|
||||
const UpdateOptions &options,
|
||||
ChunkEngineUpdateJob &chunkEngineJob,
|
||||
std::shared_ptr<StorageTarget> target,
|
||||
bool allowToAllocate = true)
|
||||
: requestCtx_(requestCtx),
|
||||
type_(updateIO.updateType),
|
||||
chunkId_(updateIO.key.chunkId),
|
||||
target_(std::move(target)),
|
||||
updateIO_(updateIO),
|
||||
chunkEngineJob_(chunkEngineJob),
|
||||
options_(options),
|
||||
allowToAllocate_(allowToAllocate) {}
|
||||
|
||||
UpdateJob(ServiceRequestContext &requestCtx,
|
||||
const CommitIO &commitIO,
|
||||
const UpdateOptions &options,
|
||||
ChunkEngineUpdateJob &chunkEngineJob,
|
||||
std::shared_ptr<StorageTarget> target)
|
||||
: requestCtx_(requestCtx),
|
||||
type_(UpdateType::COMMIT),
|
||||
chunkId_(commitIO.key.chunkId),
|
||||
target_(std::move(target)),
|
||||
commitIO_(commitIO),
|
||||
chunkEngineJob_(chunkEngineJob),
|
||||
options_(options) {}
|
||||
|
||||
auto &requestCtx() { return requestCtx_; }
|
||||
auto type() const { return type_; }
|
||||
const auto &chunkId() const { return chunkId_; }
|
||||
auto &target() const { return target_; }
|
||||
auto &updateIO() { return updateIO_; }
|
||||
auto &commitIO() { return commitIO_; }
|
||||
auto &chunkEngineJob() { return chunkEngineJob_; }
|
||||
auto &options() { return options_; }
|
||||
auto &result() { return result_; }
|
||||
auto &state() { return state_; }
|
||||
auto allowToAllocate() const { return allowToAllocate_; }
|
||||
ChainVer commitChainVer() const {
|
||||
if (options_.isSyncing) {
|
||||
return options_.commitChainVer;
|
||||
} else if (type() == UpdateType::COMMIT) {
|
||||
return commitIO_.commitChainVer;
|
||||
} else {
|
||||
return updateIO_.key.vChainId.chainVer;
|
||||
}
|
||||
}
|
||||
|
||||
CoTask<void> complete() const { co_await baton_; }
|
||||
void setResult(Result<uint32_t> result) {
|
||||
result_.lengthInfo = std::move(result);
|
||||
baton_.post();
|
||||
}
|
||||
|
||||
protected:
|
||||
ServiceRequestContext &requestCtx_;
|
||||
UpdateType type_;
|
||||
ChunkId chunkId_;
|
||||
std::shared_ptr<StorageTarget> target_;
|
||||
UpdateIO updateIO_;
|
||||
CommitIO commitIO_;
|
||||
ChunkEngineUpdateJob &chunkEngineJob_;
|
||||
UpdateOptions options_;
|
||||
IOResult result_;
|
||||
folly::coro::Baton baton_;
|
||||
struct State {
|
||||
const uint8_t *data = nullptr;
|
||||
} state_;
|
||||
bool allowToAllocate_ = true;
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
46
src/storage/update/UpdateWorker.cc
Normal file
46
src/storage/update/UpdateWorker.cc
Normal file
@@ -0,0 +1,46 @@
|
||||
#include "storage/update/UpdateWorker.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
// Create one bounded queue per disk and start the worker threads.
//
// Worker `i` serves queue `i % numberOfDisks`, so every queue needs at least one
// thread. Returns kInvalidConfig when there are no disks or fewer threads than disks.
Result<Void> UpdateWorker::start(uint32_t numberOfDisks) {
  // Without this guard an empty queueVec_ makes every worker evaluate
  // `i % queueVec_.size()` with a zero divisor.
  if (numberOfDisks == 0) {
    return makeError(StatusCode::kInvalidConfig,
                     fmt::format("update worker needs at least one disk, got {}", numberOfDisks));
  }
  if (config_.num_threads() < numberOfDisks) {
    return makeError(StatusCode::kInvalidConfig,
                     fmt::format("too few update worker threads, {} < {}", config_.num_threads(), numberOfDisks));
  }

  queueVec_.reserve(numberOfDisks);
  for (auto i = 0u; i < numberOfDisks; ++i) {
    queueVec_.emplace_back(std::make_unique<Queue>(config_.queue_size()));
  }

  for (auto i = 0u; i < config_.num_threads(); ++i) {
    executors_.add([this, i] { run(*queueVec_[i % queueVec_.size()]); });
  }
  return Void{};
}
|
||||
|
||||
void UpdateWorker::stopAndJoin() {
|
||||
if (stopped_.test_and_set()) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (auto i = 0u; i < config_.num_threads() && !queueVec_.empty(); ++i) {
|
||||
queueVec_[i % queueVec_.size()]->enqueue(nullptr);
|
||||
}
|
||||
executors_.join();
|
||||
bgExecutors_.join();
|
||||
}
|
||||
|
||||
// Worker thread body: drain `queue` until a null job arrives, handing every job to
// its target's updateChunk().
void UpdateWorker::run(Queue &queue) {
  for (;;) {
    auto *job = queue.dequeue();
    if (UNLIKELY(job == nullptr)) {
      // null job == stop request.
      XLOGF(DBG, "Storage worker {} stop...", fmt::ptr(this));
      return;
    }
    job->target()->updateChunk(*job, bgExecutors_);
  }
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
48
src/storage/update/UpdateWorker.h
Normal file
48
src/storage/update/UpdateWorker.h
Normal file
@@ -0,0 +1,48 @@
|
||||
#pragma once
|
||||
|
||||
#include <folly/executors/CPUThreadPoolExecutor.h>
|
||||
|
||||
#include "common/utils/BoundedQueue.h"
|
||||
#include "storage/store/StorageTargets.h"
|
||||
#include "storage/update/UpdateJob.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
class UpdateWorker {
|
||||
public:
|
||||
class Config : public ConfigBase<Config> {
|
||||
CONFIG_ITEM(queue_size, 4096u);
|
||||
CONFIG_ITEM(num_threads, 32ul);
|
||||
CONFIG_ITEM(bg_num_threads, 8ul);
|
||||
};
|
||||
|
||||
UpdateWorker(const Config &config)
|
||||
: config_(config),
|
||||
executors_(std::make_pair(config_.num_threads(), config_.num_threads()),
|
||||
std::make_shared<folly::NamedThreadFactory>("Update")),
|
||||
bgExecutors_(std::make_pair(config_.bg_num_threads(), config_.bg_num_threads()),
|
||||
std::make_shared<folly::NamedThreadFactory>("Recycle")) {}
|
||||
~UpdateWorker() { stopAndJoin(); }
|
||||
|
||||
Result<Void> start(uint32_t numberOfDisks);
|
||||
void stopAndJoin();
|
||||
|
||||
CoTask<void> enqueue(UpdateJob *job) {
|
||||
assert(job->target()->diskIndex() < queueVec_.size());
|
||||
co_await queueVec_[job->target()->diskIndex()]->co_enqueue(job);
|
||||
}
|
||||
|
||||
protected:
|
||||
using Queue = BoundedQueue<UpdateJob *>;
|
||||
void run(Queue &queue);
|
||||
|
||||
private:
|
||||
ConstructLog<"storage::UpdateWorker"> constructLog_;
|
||||
const Config &config_;
|
||||
std::vector<std::unique_ptr<Queue>> queueVec_;
|
||||
folly::CPUThreadPoolExecutor executors_;
|
||||
folly::CPUThreadPoolExecutor bgExecutors_;
|
||||
std::atomic_flag stopped_;
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
59
src/storage/worker/AllocateWorker.cc
Normal file
59
src/storage/worker/AllocateWorker.cc
Normal file
@@ -0,0 +1,59 @@
|
||||
#include "storage/worker/AllocateWorker.h"
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/utils/Duration.h"
|
||||
#include "common/utils/Result.h"
|
||||
#include "common/utils/UtcTime.h"
|
||||
#include "fbs/mgmtd/MgmtdTypes.h"
|
||||
#include "storage/service/Components.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
// Bind the config and components and create a single-thread executor named "Allocate".
AllocateWorker::AllocateWorker(const Config &config, Components &components)
    : config_(config),
      components_(components),
      executors_(std::make_pair(1u, 1u),
                 std::make_shared<folly::NamedThreadFactory>("Allocate")) {}
|
||||
|
||||
// Schedule loop() on the executor and mark the worker as started.
Result<Void> AllocateWorker::start() {
  auto task = [this] { loop(); };
  executors_.add(std::move(task));
  started_ = true;
  return Void{};
}
|
||||
|
||||
// Ask loop() to stop, poll every 100 ms until it reports stopped, then join the executor.
Result<Void> AllocateWorker::stopAndJoin() {
  stopping_ = true;
  cond_.notify_one();

  int round = 0;
  while (started_ && !stopped_) {
    // log on every fifth poll only.
    XLOGF_IF(INFO, round % 5 == 0, "Waiting for AllocateWorker@{}::loop stop...", fmt::ptr(this));
    std::this_thread::sleep_for(100_ms);
    ++round;
  }

  executors_.join();
  return Void{};
}
|
||||
|
||||
void AllocateWorker::loop() {
|
||||
while (!stopping_) {
|
||||
auto lock = std::unique_lock(mutex_);
|
||||
if (cond_.wait_for(lock, 100_ms, [&] { return stopping_.load(); })) {
|
||||
break;
|
||||
}
|
||||
|
||||
auto minRemainGroups = config_.min_remain_groups();
|
||||
auto maxRemainGroups = config_.max_remain_groups();
|
||||
auto minRemainUltraGroups = config_.min_remain_ultra_groups();
|
||||
auto maxRemainUltraGroups = config_.max_remain_ultra_groups();
|
||||
auto maxReserved = config_.max_reserved_chunks();
|
||||
for (auto &engine : components_.storageTargets.engines()) {
|
||||
engine->allocate_groups(minRemainGroups, maxRemainGroups, 128);
|
||||
engine->allocate_ultra_groups(minRemainUltraGroups, maxRemainUltraGroups, 32);
|
||||
engine->compact_groups(maxReserved);
|
||||
}
|
||||
}
|
||||
|
||||
XLOGF(INFO, "AllocateWorker@{}::loop stopped", fmt::ptr(this));
|
||||
stopped_ = true;
|
||||
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
46
src/storage/worker/AllocateWorker.h
Normal file
46
src/storage/worker/AllocateWorker.h
Normal file
@@ -0,0 +1,46 @@
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <condition_variable>
|
||||
#include <folly/executors/CPUThreadPoolExecutor.h>
|
||||
#include <mutex>
|
||||
|
||||
#include "storage/service/TargetMap.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
struct Components;
|
||||
|
||||
class AllocateWorker {
|
||||
public:
|
||||
class Config : public ConfigBase<Config> {
|
||||
CONFIG_HOT_UPDATED_ITEM(min_remain_groups, 4ul);
|
||||
CONFIG_HOT_UPDATED_ITEM(max_remain_groups, 8ul);
|
||||
CONFIG_HOT_UPDATED_ITEM(min_remain_ultra_groups, 0ul); // greater than 4MiB
|
||||
CONFIG_HOT_UPDATED_ITEM(max_remain_ultra_groups, 4ul);
|
||||
CONFIG_HOT_UPDATED_ITEM(max_reserved_chunks, 1_GB);
|
||||
};
|
||||
|
||||
AllocateWorker(const Config &config, Components &components);
|
||||
|
||||
Result<Void> start();
|
||||
|
||||
Result<Void> stopAndJoin();
|
||||
|
||||
protected:
|
||||
void loop();
|
||||
|
||||
private:
|
||||
ConstructLog<"storage::AllocateWorker"> constructLog_;
|
||||
const Config &config_;
|
||||
Components &components_;
|
||||
folly::CPUThreadPoolExecutor executors_;
|
||||
|
||||
std::mutex mutex_;
|
||||
std::condition_variable cond_;
|
||||
std::atomic<bool> stopping_ = false;
|
||||
std::atomic<bool> started_ = false;
|
||||
std::atomic<bool> stopped_ = false;
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
284
src/storage/worker/CheckWorker.cc
Normal file
284
src/storage/worker/CheckWorker.cc
Normal file
@@ -0,0 +1,284 @@
|
||||
#include "storage/worker/CheckWorker.h"
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/utils/Duration.h"
|
||||
#include "common/utils/UtcTime.h"
|
||||
#include "fbs/mgmtd/MgmtdTypes.h"
|
||||
#include "storage/service/Components.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
namespace {
|
||||
|
||||
monitor::ValueRecorder new_chunk_engine_count = monitor::ValueRecorder{"storage.chunk_engine.new", std::nullopt, false};
|
||||
monitor::ValueRecorder old_chunk_engine_count = monitor::ValueRecorder{"storage.chunk_engine.old", std::nullopt, false};
|
||||
|
||||
struct Recorders {
|
||||
monitor::ValueRecorder disk_capacity;
|
||||
monitor::ValueRecorder disk_readonly;
|
||||
monitor::ValueRecorder disk_available;
|
||||
monitor::ValueRecorder disk_free;
|
||||
monitor::OperationRecorder check_disk;
|
||||
monitor::ValueRecorder position_count;
|
||||
monitor::ValueRecorder position_rc;
|
||||
monitor::CountRecorder copy_on_write_times;
|
||||
monitor::LatencyRecorder copy_on_write_latency;
|
||||
monitor::CountRecorder copy_on_write_read_times;
|
||||
monitor::CountRecorder copy_on_write_read_bytes;
|
||||
monitor::LatencyRecorder copy_on_write_read_latency;
|
||||
monitor::CountRecorder checksum_reuse;
|
||||
monitor::CountRecorder checksum_combine;
|
||||
monitor::CountRecorder checksum_recalculate;
|
||||
monitor::CountRecorder safe_write_direct_append;
|
||||
monitor::CountRecorder safe_write_indirect_append;
|
||||
monitor::CountRecorder safe_write_truncate_shorten;
|
||||
monitor::CountRecorder safe_write_truncate_extend;
|
||||
monitor::CountRecorder safe_write_read_tail_times;
|
||||
monitor::CountRecorder safe_write_read_tail_bytes;
|
||||
monitor::CountRecorder allocate_times;
|
||||
monitor::LatencyRecorder allocate_latency;
|
||||
monitor::CountRecorder pwrite_times;
|
||||
monitor::LatencyRecorder pwrite_latency;
|
||||
|
||||
Recorders(const monitor::TagSet &tag)
|
||||
: disk_capacity("storage.disk_info.capacity", tag, false),
|
||||
disk_readonly("storage.disk_info.read_only", tag, false),
|
||||
disk_available("storage.disk_info.available", tag, false),
|
||||
disk_free("storage.disk_info.free", tag, false),
|
||||
check_disk("storage.check_disk", tag),
|
||||
position_count("storage.chunk_engine.position_count", tag, false),
|
||||
position_rc("storage.chunk_engine.position_rc", tag, false),
|
||||
copy_on_write_times("storage.chunk_engine.copy_on_write_times", tag),
|
||||
copy_on_write_latency("storage.chunk_engine.copy_on_write_latency", tag),
|
||||
copy_on_write_read_times("storage.chunk_engine.copy_on_write_read_times", tag),
|
||||
copy_on_write_read_bytes("storage.chunk_engine.copy_on_write_read_bytes", tag),
|
||||
copy_on_write_read_latency("storage.chunk_engine.copy_on_write_read_latency", tag),
|
||||
checksum_reuse("storage.chunk_engine.checksum_reuse", tag),
|
||||
checksum_combine("storage.chunk_engine.checksum_combine", tag),
|
||||
checksum_recalculate("storage.chunk_engine.checksum_recalculate", tag),
|
||||
safe_write_direct_append("storage.chunk_engine.safe_write_direct_append", tag),
|
||||
safe_write_indirect_append("storage.chunk_engine.safe_write_indirect_append", tag),
|
||||
safe_write_truncate_shorten("storage.chunk_engine.safe_write_truncate_shorten", tag),
|
||||
safe_write_truncate_extend("storage.chunk_engine.safe_write_truncate_extend", tag),
|
||||
safe_write_read_tail_times("storage.chunk_engine.safe_write_read_tail_times", tag),
|
||||
safe_write_read_tail_bytes("storage.chunk_engine.safe_write_read_tail_bytes", tag),
|
||||
allocate_times("storage.chunk_engine.allocate_times", tag),
|
||||
allocate_latency("storage.chunk_engine.allocate_latency", tag),
|
||||
pwrite_times("storage.chunk_engine.pwrite_times", tag),
|
||||
pwrite_latency("storage.chunk_engine.pwrite_latency", tag) {}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
// Bind the config and components and create a single-thread executor named "Check".
CheckWorker::CheckWorker(const Config &config, Components &components)
    : config_(config),
      components_(components),
      executors_(std::make_pair(1u, 1u),
                 std::make_shared<folly::NamedThreadFactory>("Check")) {}
|
||||
|
||||
// start check worker.
|
||||
// Schedule loop() on the executor and mark the worker as started. The task owns
// copies of both vectors, so the caller's arguments need not outlive this call.
Result<Void> CheckWorker::start(const std::vector<Path> &targetPaths, const std::vector<std::string> &manufacturers) {
  auto task = [this, paths = targetPaths, vendors = manufacturers] { loop(paths, vendors); };
  executors_.add(std::move(task));
  started_ = true;
  return Void{};
}
|
||||
|
||||
// stop check worker. End all tasks immediately.
|
||||
// Ask loop() to stop, poll every 100 ms until it reports stopped, then join the executor.
Result<Void> CheckWorker::stopAndJoin() {
  stopping_ = true;
  cond_.notify_one();

  int round = 0;
  while (started_ && !stopped_) {
    // log on every fifth poll only.
    XLOGF_IF(INFO, round % 5 == 0, "Waiting for CheckWorker@{}::loop stop...", fmt::ptr(this));
    std::this_thread::sleep_for(100_ms);
    ++round;
  }

  executors_.join();
  return Void{};
}
|
||||
|
||||
void CheckWorker::loop(const std::vector<Path> &targetPaths, const std::vector<std::string> &manufacturers) {
|
||||
(void)manufacturers;
|
||||
|
||||
// 0. initialize records.
|
||||
static auto recorders = [&] {
|
||||
std::vector<std::unique_ptr<Recorders>> recorders;
|
||||
for (auto i = 0ul; i < targetPaths.size(); ++i) {
|
||||
monitor::TagSet tag;
|
||||
tag.addTag("instance", std::to_string(i));
|
||||
recorders.push_back(std::make_unique<Recorders>(tag));
|
||||
}
|
||||
return recorders;
|
||||
}();
|
||||
|
||||
RelativeTime lastCheckDiskStatusTime{};
|
||||
RelativeTime lastCleanUpExpiredClientsTime{};
|
||||
RelativeTime lastTriggerHeartbeatTime{};
|
||||
RelativeTime lastUpdateTargetUsedSizeTime = RelativeTime::now();
|
||||
RelativeTime lastChunkEngineMetricsReportTime = RelativeTime::now();
|
||||
robin_hood::unordered_map<uint32_t, double> diskUsage;
|
||||
while (!stopping_) {
|
||||
auto lock = std::unique_lock(mutex_);
|
||||
if (cond_.wait_for(lock, 100_ms, [&] { return stopping_.load(); })) {
|
||||
break;
|
||||
}
|
||||
|
||||
// 1. reload offline targets.
|
||||
{
|
||||
auto snapshot = components_.targetMap.snapshot();
|
||||
for (auto &[targetId, target] : snapshot->getTargets()) {
|
||||
if (target.unrecoverableOffline()) {
|
||||
continue;
|
||||
}
|
||||
if (target.localState == flat::LocalTargetState::OFFLINE) {
|
||||
if (target.weakStorageTarget.expired()) {
|
||||
auto result = components_.storageTargets.loadTarget(target.path);
|
||||
if (UNLIKELY(!result)) {
|
||||
XLOGF(ERR, "CheckWorker@{} reload target {} failed", fmt::ptr(this), target.path);
|
||||
} else {
|
||||
XLOGF(INFO, "CheckWorker@{} reload target {} succ", fmt::ptr(this), target.path);
|
||||
components_.refreshRoutingInfo();
|
||||
}
|
||||
} else {
|
||||
XLOGF(WARNING, "CheckWorker@{} offline target {} is still being used", fmt::ptr(this), target.path);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 2. check disk status.
|
||||
auto now = RelativeTime::now();
|
||||
auto diskLowSpaceThreshold = config_.disk_low_space_threshold();
|
||||
auto diskRejectCreateChunkThreshold = config_.disk_reject_create_chunk_threshold();
|
||||
if (now - lastCheckDiskStatusTime >= 3_s) {
|
||||
lastCheckDiskStatusTime = now;
|
||||
XLOGF(DBG9, "check disk status start");
|
||||
for (auto i = 0ul; i < targetPaths.size(); ++i) {
|
||||
auto &targetPath = targetPaths[i];
|
||||
auto &recorder = *recorders[i];
|
||||
boost::system::error_code ec{};
|
||||
auto spaceInfo = boost::filesystem::space(targetPath, ec);
|
||||
if (UNLIKELY(ec.failed())) {
|
||||
XLOGF(CRITICAL, "check disk failed {}, errno: {}", targetPath, ec.message());
|
||||
components_.targetMap.offlineTargets(targetPath);
|
||||
continue;
|
||||
}
|
||||
|
||||
recorder.disk_capacity.set(spaceInfo.capacity);
|
||||
recorder.disk_free.set(spaceInfo.available);
|
||||
diskUsage[i] = 1.0 - (double)spaceInfo.available / std::max(1ul, spaceInfo.capacity);
|
||||
|
||||
auto recordGuard = recorder.check_disk.record();
|
||||
bool writable = checkWritable(targetPath);
|
||||
if (!writable) {
|
||||
recorder.disk_readonly.set(1);
|
||||
XLOGF(CRITICAL, "check disk failed {}, readonly", targetPath);
|
||||
components_.targetMap.offlineTargets(targetPath);
|
||||
continue;
|
||||
}
|
||||
recordGuard.report(true);
|
||||
|
||||
bool lowSpace = diskUsage[i] >= diskLowSpaceThreshold;
|
||||
bool rejectCreateChunk = diskUsage[i] >= diskRejectCreateChunkThreshold;
|
||||
components_.storageTargets.engines()[i]->set_allow_to_allocate(!rejectCreateChunk);
|
||||
components_.targetMap.updateDiskState(targetPath, lowSpace, rejectCreateChunk);
|
||||
}
|
||||
XLOGF(DBG9, "check disk status finished");
|
||||
}
|
||||
|
||||
// 3. clean up expired clients.
|
||||
now = RelativeTime::now();
|
||||
if (now - lastCleanUpExpiredClientsTime >= 60_s) {
|
||||
lastCleanUpExpiredClientsTime = now;
|
||||
auto result = components_.getActiveClientsList();
|
||||
if (result) {
|
||||
components_.reliableUpdate.cleanUpExpiredClients(*result);
|
||||
} else if (result.error().code() != StorageClientCode::kRoutingError) {
|
||||
XLOGF(ERR, "get active clients list error: {}", result.error());
|
||||
}
|
||||
}
|
||||
|
||||
// 4. update target used size.
|
||||
now = RelativeTime::now();
|
||||
auto emergencyRecyclingRatio = config_.emergency_recycling_ratio();
|
||||
if (now - lastUpdateTargetUsedSizeTime >= config_.update_target_size_interval()) {
|
||||
lastUpdateTargetUsedSizeTime = now;
|
||||
components_.targetMap.updateTargetUsedSize();
|
||||
|
||||
robin_hood::unordered_map<uint32_t, uint64_t> diskUnusedSize;
|
||||
robin_hood::unordered_map<uint32_t, std::pair<uint32_t, uint32_t>> chunkEngineCount;
|
||||
auto snapshot = components_.targetMap.snapshot();
|
||||
for (auto &[targetId, target] : snapshot->getTargets()) {
|
||||
if (!target.unrecoverableOffline() && target.localState != flat::LocalTargetState::OFFLINE &&
|
||||
target.storageTarget != nullptr) {
|
||||
target.storageTarget->reportUnrecycledSize();
|
||||
target.storageTarget->setEmergencyRecycling(diskUsage[target.diskIndex] >= emergencyRecyclingRatio);
|
||||
diskUnusedSize[target.diskIndex] += target.storageTarget->unusedSize();
|
||||
if (target.storageTarget->useChunkEngine()) {
|
||||
chunkEngineCount[target.diskIndex].first++;
|
||||
} else {
|
||||
chunkEngineCount[target.diskIndex].second++;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto i = 0ul; i < targetPaths.size(); ++i) {
|
||||
auto tag = monitor::instanceTagSet(std::to_string(i));
|
||||
auto [new_count, old_count] = chunkEngineCount[i];
|
||||
new_chunk_engine_count.set(new_count, tag);
|
||||
old_chunk_engine_count.set(old_count, tag);
|
||||
auto rawUsedSize = components_.storageTargets.engines()[i]->raw_used_size();
|
||||
auto &recorder = *recorders[i];
|
||||
recorder.disk_available.set(diskUnusedSize[i] + rawUsedSize.reserved_size + recorder.disk_free.value());
|
||||
recorder.position_count.set(rawUsedSize.position_count);
|
||||
recorder.position_rc.set(rawUsedSize.position_rc);
|
||||
}
|
||||
}
|
||||
|
||||
// 5. trigger heartbeat if need.
|
||||
if (now - lastTriggerHeartbeatTime >= 1_s) {
|
||||
lastTriggerHeartbeatTime = now;
|
||||
components_.triggerHeartbeatIfNeed();
|
||||
}
|
||||
|
||||
// 6. report chunk engine metrics.
|
||||
now = RelativeTime::now();
|
||||
if (now - lastChunkEngineMetricsReportTime >= 1_s) {
|
||||
lastChunkEngineMetricsReportTime = now;
|
||||
for (auto i = 0ul; i < targetPaths.size(); ++i) {
|
||||
auto &recorder = *recorders[i];
|
||||
auto metrics = components_.storageTargets.engines()[i]->get_metrics();
|
||||
recorder.copy_on_write_times.addSample(metrics.copy_on_write_times);
|
||||
if (metrics.copy_on_write_latency) {
|
||||
recorder.copy_on_write_latency.addSample(std::chrono::microseconds(metrics.copy_on_write_latency));
|
||||
}
|
||||
recorder.copy_on_write_read_times.addSample(metrics.copy_on_write_read_times);
|
||||
recorder.copy_on_write_read_bytes.addSample(metrics.copy_on_write_read_bytes);
|
||||
if (metrics.copy_on_write_read_latency) {
|
||||
recorder.copy_on_write_read_latency.addSample(std::chrono::microseconds(metrics.copy_on_write_read_latency));
|
||||
}
|
||||
recorder.checksum_reuse.addSample(metrics.checksum_reuse);
|
||||
recorder.checksum_combine.addSample(metrics.checksum_combine);
|
||||
recorder.checksum_recalculate.addSample(metrics.checksum_recalculate);
|
||||
recorder.safe_write_direct_append.addSample(metrics.safe_write_direct_append);
|
||||
recorder.safe_write_indirect_append.addSample(metrics.safe_write_indirect_append);
|
||||
recorder.safe_write_truncate_shorten.addSample(metrics.safe_write_truncate_shorten);
|
||||
recorder.safe_write_truncate_extend.addSample(metrics.safe_write_truncate_extend);
|
||||
recorder.safe_write_read_tail_times.addSample(metrics.safe_write_read_tail_times);
|
||||
recorder.safe_write_read_tail_bytes.addSample(metrics.safe_write_read_tail_bytes);
|
||||
recorder.allocate_times.addSample(metrics.allocate_times);
|
||||
if (metrics.allocate_latency) {
|
||||
recorder.allocate_latency.addSample(std::chrono::microseconds(metrics.allocate_latency));
|
||||
}
|
||||
recorder.pwrite_times.addSample(metrics.pwrite_times);
|
||||
if (metrics.pwrite_latency) {
|
||||
recorder.pwrite_latency.addSample(std::chrono::microseconds(metrics.pwrite_latency));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
stopped_ = true;
|
||||
XLOGF(INFO, "CheckWorker@{}::loop stopped", fmt::ptr(this));
|
||||
}
|
||||
|
||||
bool CheckWorker::checkWritable(const Path &path) {
|
||||
std::ofstream check(path / ".hf3fs_check", std::ios::out);
|
||||
return check && (check << fmt::format("{}", UtcTime{UtcClock::now()}));
|
||||
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
51
src/storage/worker/CheckWorker.h
Normal file
51
src/storage/worker/CheckWorker.h
Normal file
@@ -0,0 +1,51 @@
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <condition_variable>
|
||||
#include <folly/executors/CPUThreadPoolExecutor.h>
|
||||
#include <mutex>
|
||||
|
||||
#include "common/utils/ConfigBase.h"
|
||||
#include "storage/service/TargetMap.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
|
||||
struct Components;
|
||||
|
||||
class CheckWorker {
|
||||
public:
|
||||
class Config : public ConfigBase<Config> {
|
||||
CONFIG_HOT_UPDATED_ITEM(update_target_size_interval, 10_s);
|
||||
CONFIG_HOT_UPDATED_ITEM(emergency_recycling_ratio, 0.95);
|
||||
CONFIG_HOT_UPDATED_ITEM(disk_low_space_threshold, 0.96);
|
||||
CONFIG_HOT_UPDATED_ITEM(disk_reject_create_chunk_threshold, 0.98);
|
||||
};
|
||||
|
||||
CheckWorker(const Config &config, Components &components);
|
||||
|
||||
// start check worker.
|
||||
Result<Void> start(const std::vector<Path> &targetPaths, const std::vector<std::string> &manufacturers);
|
||||
|
||||
// stop check worker. End all tasks immediately.
|
||||
Result<Void> stopAndJoin();
|
||||
|
||||
// check hf3fs path writable or not.
|
||||
static bool checkWritable(const Path &path);
|
||||
|
||||
protected:
|
||||
void loop(const std::vector<Path> &targetPaths, const std::vector<std::string> &manufacturers);
|
||||
|
||||
private:
|
||||
ConstructLog<"storage::CheckWorker"> constructLog_;
|
||||
const Config &config_;
|
||||
Components &components_;
|
||||
folly::CPUThreadPoolExecutor executors_;
|
||||
|
||||
std::mutex mutex_;
|
||||
std::condition_variable cond_;
|
||||
std::atomic<bool> stopping_ = false;
|
||||
std::atomic<bool> started_ = false;
|
||||
std::atomic<bool> stopped_ = false;
|
||||
};
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
178
src/storage/worker/DumpWorker.cc
Normal file
178
src/storage/worker/DumpWorker.cc
Normal file
@@ -0,0 +1,178 @@
|
||||
#include "storage/worker/DumpWorker.h"
|
||||
|
||||
#include <gperftools/profiler.h>
|
||||
#include <memory>
|
||||
#include <sys/times.h>
|
||||
|
||||
#include "common/monitor/Recorder.h"
|
||||
#include "common/utils/Duration.h"
|
||||
#include "common/utils/Result.h"
|
||||
#include "common/utils/UtcTime.h"
|
||||
#include "fbs/mgmtd/MgmtdTypes.h"
|
||||
#include "storage/service/Components.h"
|
||||
|
||||
namespace hf3fs::storage {
|
||||
namespace {
|
||||
|
||||
monitor::ValueRecorder cpuCores{"storage.sys.cpu_cores", std::nullopt, true};
|
||||
|
||||
}
|
||||
|
||||
// Stores the configuration and shared components it is given and builds the executor with the
// thread-count pair (1, 1) and a NamedThreadFactory named "Dump".
DumpWorker::DumpWorker(const Config &config, Components &components)
    : config_(config),
      components_(components),
      executors_(std::make_pair(1u, 1u), std::make_shared<folly::NamedThreadFactory>("Dump")) {}
|
||||
|
||||
// start dump worker.
|
||||
Result<Void> DumpWorker::start(flat::NodeId id) {
|
||||
executors_.add([this] { loop(); });
|
||||
started_ = true;
|
||||
nodeId_ = id;
|
||||
return Void{};
|
||||
}
|
||||
|
||||
// stop dump worker. End all tasks immediately.
|
||||
Result<Void> DumpWorker::stopAndJoin() {
|
||||
stopping_ = true;
|
||||
cond_.notify_one();
|
||||
for (int i = 0; started_ && !stopped_; ++i) {
|
||||
XLOGF_IF(INFO, i % 5 == 0, "Waiting for DumpWorker@{}::loop stop...", fmt::ptr(this));
|
||||
std::this_thread::sleep_for(100_ms);
|
||||
}
|
||||
executors_.join();
|
||||
return Void{};
|
||||
}
|
||||
|
||||
void DumpWorker::loop() {
|
||||
RelativeTime lastDumpTime = RelativeTime::now();
|
||||
|
||||
struct tms last_tms {};
|
||||
struct tms cur_tms {};
|
||||
auto last_tck = times(&last_tms);
|
||||
|
||||
bool profiler = false;
|
||||
RelativeTime lastProfilerTime = RelativeTime::now();
|
||||
|
||||
while (!stopping_) {
|
||||
auto lock = std::unique_lock(mutex_);
|
||||
if (cond_.wait_for(lock, 1000_ms, [&] { return stopping_.load(); })) {
|
||||
break;
|
||||
}
|
||||
|
||||
auto cur_tck = times(&cur_tms);
|
||||
if (last_tck < cur_tck && last_tms.tms_stime <= cur_tms.tms_stime && last_tms.tms_utime <= cur_tms.tms_utime) {
|
||||
auto elapsed = cur_tck - last_tck;
|
||||
auto usage = (cur_tms.tms_stime - last_tms.tms_stime) + (cur_tms.tms_utime - last_tms.tms_utime);
|
||||
auto cores = usage / elapsed;
|
||||
cpuCores.set(cores);
|
||||
if (!profiler && cores >= config_.high_cpu_usage_threshold()) {
|
||||
profiler = true;
|
||||
lastProfilerTime = RelativeTime::now();
|
||||
profilerStart(config_.dump_root_path());
|
||||
}
|
||||
}
|
||||
last_tck = cur_tck;
|
||||
last_tms = cur_tms;
|
||||
|
||||
if (profiler && RelativeTime::now() - lastProfilerTime >= 1_min) {
|
||||
ProfilerStop();
|
||||
profiler = false;
|
||||
}
|
||||
|
||||
// 1. dump all targets.
|
||||
auto now = RelativeTime::now();
|
||||
if (now - lastDumpTime >= config_.dump_interval()) {
|
||||
auto rootPath = config_.dump_root_path();
|
||||
if (rootPath.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
dump(rootPath);
|
||||
lastDumpTime = now;
|
||||
last_tck = times(&last_tms);
|
||||
}
|
||||
}
|
||||
stopped_ = true;
|
||||
XLOGF(INFO, "DumpWorker@{}::loop stopped", fmt::ptr(this));
|
||||
}
|
||||
|
||||
// Dump the chunk metadata of every local target that is not OFFLINE and has a storage target.
//
// Files are written to <rootPath>/<local date, "%F">/<chainId>.<chainVer>.<targetId>.<hostname>,
// one file per target, containing serde::serializeBytes() of the target's metadata map.
//
// @param rootPath  directory under which the per-day dump directory is created.
// @return          Void{} on success (also when interrupted by a stop request); an error as soon
//                  as creating the directory, reading metadata, or writing a file fails. Targets
//                  after the failing one are not dumped.
Result<Void> DumpWorker::dump(const Path &rootPath) {
  auto hostname = SysResource::hostname().value_or("unknown");

  auto dumpPath = rootPath / fmt::format("{:%F}", fmt::localtime(std::time(nullptr)));
  boost::system::error_code ec{};
  boost::filesystem::create_directories(dumpPath, ec);
  if (UNLIKELY(ec.failed())) {
    auto msg = fmt::format("dump meta create directory {} failed: {}", dumpPath, ec.message());
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }

  // Collect weak references first so that the target map snapshot is released before any I/O.
  std::map<std::string, std::weak_ptr<StorageTarget>> targets;
  {
    auto targetMap = components_.targetMap.snapshot();
    for (auto &[targetId, target] : targetMap->getTargets()) {
      if (target.localState != flat::LocalTargetState::OFFLINE && target.storageTarget != nullptr) {
        auto dumpFileName = fmt::format("{}.{}.{}.{}",
                                        target.vChainId.chainId.toUnderType(),
                                        target.vChainId.chainVer.toUnderType(),
                                        targetId.toUnderType(),
                                        hostname);
        targets[dumpFileName] = target.storageTarget;
      }
    }
  }

  for (auto &[name, weakTarget] : targets) {
    if (stopping_) {
      break;
    }
    auto target = weakTarget.lock();
    if (!target) {
      // The target was destroyed since the snapshot was taken.
      continue;
    }

    auto dumpFilePath = dumpPath / name;
    std::unordered_map<ChunkId, ChunkMetadata> metas;
    auto dumpResult = target->getAllMetadataMap(metas);
    if (UNLIKELY(!dumpResult)) {
      XLOGF(ERR, "dump meta {} failed: {}", dumpFilePath, dumpResult.error());
      return makeError(std::move(dumpResult.error()));
    }
    // Drop the strong reference before serializing and writing the file.
    target = nullptr;

    std::ofstream dumpFile{dumpFilePath};
    if (UNLIKELY(!dumpFile)) {
      auto msg = fmt::format("dump meta create file failed: {}", dumpFilePath);
      XLOG(ERR, msg);
      return makeError(StorageCode::kStorageInitFailed, std::move(msg));
    }
    auto bytes = serde::serializeBytes(metas);
    auto view = std::string_view{bytes};
    dumpFile.write(view.data(), view.size());
    // write() may only fill the stream buffer; flush so that a failing write (e.g. disk full) is
    // reflected in the stream state checked below instead of being lost in the destructor.
    dumpFile.flush();
    if (UNLIKELY(!dumpFile)) {
      auto msg = fmt::format("dump meta write file failed: {}", dumpFilePath);
      XLOG(ERR, msg);
      return makeError(StorageCode::kStorageInitFailed, std::move(msg));
    }
  }
  return Void{};
}
|
||||
|
||||
// Start the gperftools CPU profiler, writing to
// <rootPath>/<local date, "%F">/<nodeId>.<local time, "%T">.perf.
//
// @param rootPath  directory under which the per-day directory is created.
// @return          Void{} when profiling was started; an error when the directory cannot be
//                  created or ProfilerStart() reports failure.
Result<Void> DumpWorker::profilerStart(const Path &rootPath) {
  // Sample the clock once so that the directory name and the file name agree, even around midnight.
  const auto now = fmt::localtime(std::time(nullptr));

  auto dumpPath = rootPath / fmt::format("{:%F}", now);
  boost::system::error_code ec{};
  boost::filesystem::create_directories(dumpPath, ec);
  if (UNLIKELY(ec.failed())) {
    auto msg = fmt::format("profiler create directory {} failed: {}", dumpPath, ec.message());
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }

  auto dumpFile = dumpPath / fmt::format("{}.{:%T}.perf", nodeId_.toUnderType(), now);
  // ProfilerStart() returns zero when profiling could not be started.
  if (UNLIKELY(ProfilerStart(dumpFile.c_str()) == 0)) {
    auto msg = fmt::format("profiler start failed: {}", dumpFile);
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }

  return Void{};
}
|
||||
|
||||
} // namespace hf3fs::storage
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user