Initial commit

This commit is contained in:
dev
2025-02-27 21:53:53 +08:00
commit 815e55e4c0
1291 changed files with 185445 additions and 0 deletions

View File

@@ -0,0 +1,5 @@
# Register the chunk_engine crate with the build (add_crate is a project helper defined elsewhere).
add_crate(chunk_engine)
# Define the `storage` library target and the targets it depends on, including chunk_engine.
target_add_lib(storage core-app core-service memory-common storage-fbs mgmtd-client storage-client kv analytics aio chunk_engine profiler)
# Make the vendored leveldb directory visible to `storage` and to everything that links it.
target_include_directories(storage PUBLIC ${CMAKE_SOURCE_DIR}/third_party/leveldb)
# Define the storage_main binary from storage.cpp, depending on `storage` and jemalloc.
target_add_bin(storage_main "storage.cpp" storage jemalloc)

View File

@@ -0,0 +1,96 @@
#include "storage/aio/AioReadWorker.h"
#include <folly/ScopeGuard.h>
#include <folly/logging/xlog.h>
#include <folly/system/ThreadName.h>
#include "common/monitor/Recorder.h"
#include "storage/aio/AioStatus.h"
#include "storage/aio/BatchReadJob.h"
namespace hf3fs::storage {
// Latency between a job iterator's start time and the moment a worker dequeues it
// (sampled in AioReadWorker::run).
monitor::LatencyRecorder batchReadInQueueRecorder{"storage.batch_read_in_queue.latency"};
// Counter adjusted by AioReadWorker::run: -1 right before blocking on the queue, +1 right
// after a job is dequeued, so it tracks workers that are not waiting for work.
monitor::CountRecorder aioRunningThreadsCount{"storage.aio_running_threads.count", std::nullopt, false};
// Shuts the worker pool down on destruction by delegating to stopAndJoin().
AioReadWorker::~AioReadWorker() {
  stopAndJoin();
}
/// Starts one long-running worker task per configured thread and blocks until every
/// task has finished (or failed) its I/O context initialization.
///
/// @param fds     forwarded to IoUringStatus::init when io_uring is enabled.
/// @param iovecs  forwarded to IoUringStatus::init when io_uring is enabled.
/// @return the error stored in initResult_ by a failing worker, otherwise Void.
Result<Void> AioReadWorker::start(const std::vector<int> &fds, const std::vector<struct iovec> &iovecs) {
  const uint32_t workerCount = config_.num_threads();
  for (uint32_t workerIdx = 0; workerIdx != workerCount; ++workerIdx) {
    // fds/iovecs are captured by reference; they are only touched during the init
    // section below, and start() does not return before every worker has left it.
    executors_.add([&]() {
      AioStatus libaioState;
      IoUringStatus uringState;
      {
        // Always report "initialized", even on failure, so start() cannot wait forever.
        SCOPE_EXIT { ++initialized_; };
        auto libaioInit = libaioState.init(config_.max_events());
        if (UNLIKELY(!libaioInit)) {
          XLOGF(ERR, "aio status init failed: {}", libaioInit.error());
          *initResult_.lock() = std::move(libaioInit);
          return;
        }
        if (config_.enable_io_uring()) {
          auto uringInit = uringState.init(config_.max_events(), fds, iovecs);
          if (UNLIKELY(!uringInit)) {
            XLOGF(ERR, "io uring status init failed: {}", uringInit.error());
            *initResult_.lock() = std::move(uringInit);
            return;
          }
        }
      }
      run(libaioState, uringState);
    });
  }

  // Poll until all workers have passed initialization, logging every fifth round.
  int pollRound = 0;
  while (initialized_.load() != workerCount) {
    XLOGF_IF(INFO, pollRound % 5 == 0, "Waiting for AioReadWorker@{}::run start...", fmt::ptr(this));
    std::this_thread::sleep_for(100_ms);
    ++pollRound;
  }
  RETURN_AND_LOG_ON_ERROR(*initResult_.lock());
  return Void{};
}
/// Asks every worker to exit and waits for the executor to finish.
///
/// One default-constructed (null) iterator is enqueued per configured thread;
/// run() returns when it dequeues such an iterator.
Result<Void> AioReadWorker::stopAndJoin() {
  for (size_t sent = 0; sent < config_.num_threads(); ++sent) {
    queue_.enqueue(AioReadJobIterator{});
  }
  executors_.join();
  return Void{};
}
/// Worker loop: repeatedly dequeues a batch read job iterator and drives it through
/// collect -> submit -> reap until the iterator is exhausted.
///
/// @param aioStatus      libaio state owned by the calling worker task.
/// @param ioUringStatus  io_uring state owned by the calling worker task.
/// @return Void once a null iterator (the stop signal) is dequeued.
Result<Void> AioReadWorker::run(AioStatus &aioStatus, IoUringStatus &ioUringStatus) {
  aioRunningThreadsCount.addSample(1);
  auto guard = folly::makeGuard([] { aioRunningThreadsCount.addSample(-1); });

  for (;;) {
    // 1. try to fetch a batch read job; the thread is not counted as running while it waits.
    aioRunningThreadsCount.addSample(-1);
    auto jobIt = queue_.dequeue();  // waiting.
    aioRunningThreadsCount.addSample(1);

    if (jobIt.isNull()) {
      XLOGF(DBG, "Stop AioReadWorker {}...", fmt::ptr(this));
      return Void{};
    }

    batchReadInQueueRecorder.addSample(RelativeTime::now() - jobIt.startTime());
    jobIt->batch().resetStartTime();

    // The engine is chosen once per dequeued iterator.
    IoStatus *engine = &aioStatus;
    if (config_.useIoUring()) {
      engine = &ioUringStatus;
    }
    engine->setAioReadJobIterator(jobIt);

    do {
      // 2. collect a batch of read jobs.
      engine->collect();
      // 3. submit a batch of read jobs.
      engine->submit();
      // 4. wait a batch of events until nothing is in flight.
      while (engine->inflight()) {
        engine->reap(config_.min_complete());
      }
    } while (engine->hasUnfinishedBatchReadJob());
  }
  return Void{};
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,78 @@
#pragma once
#include <atomic>
#include <folly/Random.h>
#include <folly/executors/CPUThreadPoolExecutor.h>
#include <memory>
#include <vector>
#include "common/utils/BoundedQueue.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/Result.h"
#include "storage/aio/AioStatus.h"
#include "storage/aio/BatchReadJob.h"
#include "storage/store/StorageTargets.h"
namespace hf3fs::storage {
/// Thread pool that consumes batch read jobs from a bounded queue and executes them
/// through libaio or io_uring (see AioReadWorker::run in the implementation file).
class AioReadWorker {
 public:
  /// I/O engine selection used by Config::useIoUring().
  enum class IoEngine {
    libaio,
    io_uring,
    random,  // pick libaio or io_uring at random on every query.
  };

  class Config : public ConfigBase<Config> {
    CONFIG_ITEM(num_threads, 32ul);
    CONFIG_ITEM(queue_size, 4096u);
    CONFIG_ITEM(max_events, 512u);
    CONFIG_ITEM(enable_io_uring, true);
    CONFIG_HOT_UPDATED_ITEM(min_complete, 128u);
    CONFIG_HOT_UPDATED_ITEM(wait_all_inflight, false);      // deprecated.
    CONFIG_HOT_UPDATED_ITEM(inflight_control_offset, 128);  // deprecated.
    CONFIG_HOT_UPDATED_ITEM(ioengine, IoEngine::libaio);

   public:
    /// Decides whether the next job should go through io_uring.
    /// Always false when enable_io_uring is off, regardless of `ioengine`.
    inline bool useIoUring() const {
      if (!enable_io_uring()) {
        return false;
      }
      switch (ioengine()) {
        case IoEngine::io_uring:
          return true;
        case IoEngine::libaio:
          return false;
        case IoEngine::random:
          return folly::Random::rand32() & 1;
      }
      // Not reachable for the enumerators above. An out-of-range value would otherwise
      // flow off the end of a non-void function (undefined behaviour); fall back to
      // libaio, whose context is always initialized by the workers.
      return false;
    }
  };

  /// @param config  kept by reference; it must outlive this worker.
  AioReadWorker(const Config &config)
      : config_(config),
        queue_(config.queue_size()),
        executors_(std::make_pair(config_.num_threads(), config_.num_threads()),
                   std::make_shared<folly::NamedThreadFactory>("AioRead")) {}
  ~AioReadWorker();

  /// Enqueues a job iterator for the worker threads.
  CoTask<void> enqueue(AioReadJobIterator job) { co_await queue_.co_enqueue(job); }

  Result<Void> start(const std::vector<int> &fds, const std::vector<struct iovec> &iovecs);

  Result<Void> stopAndJoin();

 protected:
  Result<Void> run(AioStatus &aioStatus, IoUringStatus &ioUringStatus);

 private:
  ConstructLog<"storage::AioReadWorker"> constructLog_;
  const Config &config_;
  BoundedQueue<AioReadJobIterator> queue_;
  folly::CPUThreadPoolExecutor executors_;

  // Number of worker tasks that have finished initialization (successfully or not).
  std::atomic<uint32_t> initialized_{};
  // Last initialization error reported by a worker, or Void if none failed.
  folly::Synchronized<Result<Void>, std::mutex> initResult_{Void{}};
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,286 @@
#include "storage/aio/AioStatus.h"
#include <chrono>
#include <liburing.h>
#include "common/monitor/Recorder.h"
#include "common/utils/Duration.h"
#include "storage/aio/BatchReadJob.h"
namespace hf3fs::storage {
namespace {
// Metrics for the AIO / io_uring read path in this file.
// Number of in-flight requests, sampled at the end of AioStatus::submit.
monitor::DistributionRecorder inflightNum("storage.aio.inflight");
// Incremented by setReadJobResult for every read that completes with a negative result.
monitor::CountRecorder aioReadFailCount{"storage.aio.fail_count"};
// NOTE(review): not referenced in the visible part of this file.
monitor::CountRecorder aioReadEIOCount{"storage.aio.eio_count"};
// Times the collect() phase of both engines.
monitor::OperationRecorder ioCollectRecorder{"storage.io_collect"};
// Times io_submit / io_uring_submit calls.
monitor::OperationRecorder ioSubmitRecorder{"storage.io_submit"};
// Number of requests accepted per successful submit call.
monitor::DistributionRecorder ioSubmitSize("storage.io_submit.size");
// Number of io_submit attempts made by one AioStatus::submit call.
monitor::DistributionRecorder ioSubmitLoop("storage.io_submit.loop");
// io_submit calls rejected with EBADF.
monitor::CountRecorder ioSubmitBadFd("storage.io_submit.badfd_count");
// io_submit calls failing with any other unrecoverable error.
monitor::CountRecorder ioSubmitError("storage.io_submit.error_count");
// Times io_getevents / io_uring_wait_cqes calls.
monitor::OperationRecorder ioGetEventsRecorder{"storage.io_getevents"};
// Number of completions handled per reap() call.
monitor::DistributionRecorder ioGetEventsSize("storage.io_getevents.size");
/// Translates a raw completion value into the job's final result.
///
/// @param raw  pointer to the AioReadJob that was attached to the request.
/// @param res  bytes read when non-negative, otherwise a negated errno.
void setReadJobResult(void *raw, int64_t res) {
  auto job = static_cast<AioReadJob *>(raw);
  // Read everything needed up front: the job must not be touched after setResult().
  auto storageTarget = job->state().storageTarget;

  if (res < 0) {
    if (storageTarget != nullptr) {
      aioReadFailCount.addSample(1, storageTarget->tag());
    } else {
      aioReadFailCount.addSample(1);
    }
    XLOGF(ERR,
          "set read job failed: {}, state: {}, buf: {}, code: {}",
          job->readIO(),
          job->state(),
          fmt::ptr(job->state().localbuf.ptr()),
          -res);
    job->setResult(makeError(StorageCode::kChunkReadFailed, fmt::format("errno: {}", -res)));
    // WARNING: job is no longer available.
    return;
  }

  auto latency = RelativeTime::now() - job->startTime();
  storageTarget->recordRealRead(res, latency);

  // The usable length is bounded by three things: what was read past the aligned head,
  // what the caller asked for, and what the chunk holds beyond the requested offset.
  auto pastHead = std::max(0l, res - job->state().headLength);
  auto requested = int64_t(job->readIO().length);
  auto chunkRemain = std::max(0l, int64_t(job->state().chunkLen) - job->readIO().offset);
  auto length = std::min(std::min(pastHead, requested), chunkRemain);

  if (UNLIKELY(length == 0 && job->readIO().length > 0)) {
    XLOGF(WARNING, "read length is 0: {}, state: {}", job->readIO(), job->state());
  }
  job->setResult(length);
  // WARNING: job is no longer available.
}
} // namespace
// Tears down the kernel AIO context, if init() ever created one.
AioStatus::~AioStatus() {
  if (aioContext_ != nullptr) {
    ::io_destroy(aioContext_);
  }
}
/// Creates the AIO context and pre-allocates `maxEvents` control blocks and event slots.
///
/// @param maxEvents  capacity passed to io_setup and used for all internal buffers.
/// @return kInvalidConfig if io_setup fails, otherwise Void.
Result<Void> AioStatus::init(uint32_t maxEvents) {
  maxEvents_ = maxEvents;

  // 1. init aio context.
  const int setupRet = ::io_setup(maxEvents, &aioContext_);
  if (UNLIKELY(setupRet != 0)) {
    auto msg = fmt::format("init aio context failed: {}, maxEvents {}", setupRet, maxEvents);
    XLOG(ERR, msg);
    return makeError(StatusCode::kInvalidConfig, std::move(msg));
  }

  // 2. init iocb: every control block starts out available.
  iocbs_.resize(maxEvents);
  availables_.reserve(maxEvents);
  for (size_t idx = 0; idx < iocbs_.size(); ++idx) {
    availables_.push_back(&iocbs_[idx]);
  }
  events_.resize(maxEvents);
  return Void{};
}
void AioStatus::collect() {
auto recordGuard = ioCollectRecorder.record();
while (availableToSubmit() && iterator_) {
auto &job = *iterator_++;
auto result = job.state().storageTarget->aioPrepareRead(job);
if (UNLIKELY(!result)) {
job.setResult(makeError(std::move(result.error())));
continue;
}
++readyToSubmit_;
++inflight_;
auto iocb = availables_.back();
availables_.pop_back();
auto &state = job.state();
job.resetStartTime();
::io_prep_pread(iocb, state.readFd, state.localbuf.ptr(), state.readLength, state.readOffset);
iocb->data = &job;
}
recordGuard.succ();
}
// Submits every control block prepared by collect() via io_submit, retrying until
// readyToSubmit_ drops to zero.
//
// collect() pops prepared iocb pointers off the back of availables_, so the pointers to
// submit are the readyToSubmit_ slots starting at index availables_.size(). Those slots
// lie past size() and are reached through operator[]; init() reserved maxEvents entries.
void AioStatus::submit() {
uint32_t submitStartPoint = availables_.size();
uint32_t loopCnt = 0;
while (readyToSubmit_) {
++loopCnt;
auto recordGuard = ioSubmitRecorder.record();
int ret = ::io_submit(aioContext_, readyToSubmit_, &availables_[submitStartPoint]);
auto elapsedTime = RelativeTime::now() - recordGuard.startTime();
if (UNLIKELY(elapsedTime >= 5_s)) {
XLOGF(WARNING, "io_submit took too long {}, submit {} ret {}", elapsedTime.asMs(), readyToSubmit_, ret);
}
if (ret >= 0) {
// `ret` requests were accepted; advance past them and retry with the remainder.
recordGuard.succ();
ioSubmitSize.addSample(ret);
submitStartPoint += ret;
readyToSubmit_ -= ret;
} else if (ret == -EAGAIN) {
// Retry immediately with the same range.
continue;
} else if (ret == -EBADF) {
XLOGF(ERR, "aio submit bad file descriptor {}. ret: {}", availables_[submitStartPoint]->aio_fildes, ret);
// set failed and skip it.
ioSubmitBadFd.addSample(1);
setReadJobResult(availables_[submitStartPoint]->data, -EBADF);
// Return the rejected control block to the available pool.
availables_.push_back(availables_[submitStartPoint]);
++submitStartPoint;
--readyToSubmit_;
--inflight_;
} else {
ioSubmitError.addSample(1);
XLOGF(ERR, "Unrecoverable aio submit error. ret: {}", ret);
// set all jobs failed.
while (readyToSubmit_) {
setReadJobResult(availables_[submitStartPoint]->data, ret);
availables_.push_back(availables_[submitStartPoint]);
++submitStartPoint;
--readyToSubmit_;
--inflight_;
}
break;
}
}
ioSubmitLoop.addSample(loopCnt);
inflightNum.addSample(inflight());
}
void AioStatus::reap(uint32_t minCompleteIn) {
uint32_t minComplete = std::min(inflight(), minCompleteIn);
auto recordGuard = ioGetEventsRecorder.record();
int ret = ::io_getevents(aioContext_, minComplete, inflight(), events_.data(), nullptr);
if (LIKELY(ret >= 0)) {
recordGuard.succ();
ioGetEventsSize.addSample(ret);
inflight_ -= ret;
for (int i = 0; i < ret; ++i) {
auto &event = events_[i];
availables_.push_back(event.obj);
setReadJobResult(event.data, event.res);
}
} else if (ret == -EINTR) {
XLOGF(INFO, "aio is interrupted by a signal handler");
return;
} else {
XLOGF(ERR, "aio io_getevents error: {}", ret);
return;
}
}
// Releases the ring if one was set up; ring_ is value-initialized, so a zero ring_fd
// is treated as "never initialized".
IoUringStatus::~IoUringStatus() {
  if (ring_.ring_fd != 0) {
    ::io_uring_queue_exit(&ring_);
  }
}
/// Sets up the io_uring instance and registers the given files and buffers with it.
///
/// @param maxEvents  queue depth passed to io_uring_queue_init.
/// @param fds        file descriptors to register; skipped when empty.
/// @param iovecs     buffers to register; skipped when empty.
/// @return kInvalidConfig if any liburing call fails, otherwise Void.
Result<Void> IoUringStatus::init(uint32_t maxEvents,
                                 const std::vector<int> &fds,
                                 const std::vector<struct iovec> &iovecs) {
  maxEvents_ = maxEvents;

  const int queueRet = ::io_uring_queue_init(maxEvents_, &ring_, 0);
  if (UNLIKELY(queueRet != 0)) {
    auto msg = fmt::format("init io uring failed: {}, maxEvents {}", queueRet, maxEvents);
    XLOG(ERR, msg);
    return makeError(StatusCode::kInvalidConfig, std::move(msg));
  }
  submittingJobs_.reserve(maxEvents_);

  if (!fds.empty()) {
    const int filesRet = ::io_uring_register_files(&ring_, fds.data(), fds.size());
    if (UNLIKELY(filesRet != 0)) {
      auto msg = fmt::format("io_uring_register_files failed: {}, size: {}", filesRet, fds.size());
      XLOG(ERR, msg);
      return makeError(StatusCode::kInvalidConfig, std::move(msg));
    }
  }

  if (!iovecs.empty()) {
    const int buffersRet = ::io_uring_register_buffers(&ring_, iovecs.data(), iovecs.size());
    if (UNLIKELY(buffersRet != 0)) {
      auto msg = fmt::format("io_uring_register_buffers failed: {}, size: {}", buffersRet, iovecs.size());
      XLOG(ERR, msg);
      return makeError(StatusCode::kInvalidConfig, std::move(msg));
    }
  }
  return Void{};
}
void IoUringStatus::collect() {
auto recordGuard = ioCollectRecorder.record();
while (availableToSubmit() && iterator_) {
auto &job = *iterator_++;
auto result = job.state().storageTarget->aioPrepareRead(job);
if (UNLIKELY(!result)) {
job.setResult(makeError(std::move(result.error())));
continue;
}
++inflight_;
auto &state = job.state();
job.resetStartTime();
struct io_uring_sqe *sqe = ::io_uring_get_sqe(&ring_);
assert(sqe != nullptr);
::io_uring_prep_read_fixed(sqe,
state.fdIndex.value_or(state.readFd),
state.localbuf.ptr(),
state.readLength,
state.readOffset,
state.bufferIndex);
if (state.fdIndex) {
sqe->flags |= IOSQE_FIXED_FILE;
}
::io_uring_sqe_set_data(sqe, &job);
submittingJobs_.push_back(&job);
}
recordGuard.succ();
}
void IoUringStatus::submit() {
auto recordGuard = ioSubmitRecorder.record();
int ret = ::io_uring_submit(&ring_);
if (LIKELY(ret >= 0)) {
assert(ret == (int)inflight_);
recordGuard.succ();
ioSubmitSize.addSample(ret);
} else {
XLOGF(CRITICAL, "io_uring submit error: {}", ret);
for (auto &job : submittingJobs_) {
setReadJobResult(job, ret);
}
inflight_ -= submittingJobs_.size();
}
submittingJobs_.clear();
}
void IoUringStatus::reap(uint32_t minCompleteIn) {
auto recordGuard = ioGetEventsRecorder.record();
io_uring_cqe *cqe = nullptr;
int ret = ::io_uring_wait_cqes(&ring_, &cqe, std::min(inflight(), minCompleteIn), nullptr, nullptr);
if (LIKELY(ret >= 0)) {
recordGuard.succ();
uint32_t cnt = 0;
unsigned head = 0;
io_uring_for_each_cqe(&ring_, head, cqe) {
++cnt;
setReadJobResult(::io_uring_cqe_get_data(cqe), cqe->res);
}
ioGetEventsSize.addSample(cnt);
inflight_ -= cnt;
::io_uring_cq_advance(&ring_, cnt);
} else if (ret == -EINTR) {
XLOGF(INFO, "io_uring is interrupted by a signal handler");
return;
} else {
XLOGF(ERR, "io_uring wait_cqes error: {}", ret);
return;
}
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,73 @@
#pragma once
#include <libaio.h>
#include <liburing.h>
#include <vector>
#include "storage/aio/BatchReadJob.h"
#include "storage/store/StorageTargets.h"
namespace hf3fs::storage {
/// Common interface and bookkeeping for the per-thread I/O engines.
/// A worker drives an engine as collect() -> submit() -> reap() over one job iterator.
class IoStatus {
 public:
  virtual ~IoStatus() = default;

  /// True while the current iterator still has jobs that were not collected.
  bool hasUnfinishedBatchReadJob() const { return static_cast<bool>(iterator_); }
  /// Replaces the iterator the engine collects from.
  void setAioReadJobIterator(AioReadJobIterator it) { iterator_ = it; }

  /// True while fewer than maxEvents_ requests are in flight.
  bool availableToSubmit() const { return maxEvents_ > inflight_; }
  uint32_t inflight() const { return inflight_; }

  /// Prepare requests for jobs taken from the iterator.
  virtual void collect() = 0;
  /// Submit the prepared requests.
  virtual void submit() = 0;
  /// Wait for completions and finish the corresponding jobs.
  virtual void reap(uint32_t minCompleteIn) = 0;

 protected:
  AioReadJobIterator iterator_;
  uint32_t maxEvents_ = 0;
  uint32_t inflight_ = 0;
};
// IoStatus implementation backed by Linux libaio (io_setup / io_submit / io_getevents).
class AioStatus : public IoStatus {
public:
~AioStatus() override;
// Creates the AIO context and allocates maxEvents control blocks and event slots.
Result<Void> init(uint32_t maxEvents);
void collect() override;
void submit() override;
void reap(uint32_t minCompleteIn) override;
private:
// Control blocks prepared by collect() that submit() has not yet handed to the kernel.
uint32_t readyToSubmit_ = 0;
io_context_t aioContext_ = nullptr;
// Storage for all control blocks; availables_ holds pointers into this vector.
std::vector<struct iocb> iocbs_;
// Control blocks that are currently free to use.
std::vector<struct iocb *> availables_;
// Output buffer for io_getevents.
std::vector<struct io_event> events_;
};
// IoStatus implementation backed by io_uring through liburing.
class IoUringStatus : public IoStatus {
public:
~IoUringStatus() override;
// Sets up the ring and registers `fds` and `iovecs` with it (each skipped when empty).
Result<Void> init(uint32_t maxEvents, const std::vector<int> &fds, const std::vector<struct iovec> &iovecs);
void collect() override;
void submit() override;
void reap(uint32_t minCompleteIn) override;
private:
// Value-initialized so the destructor can tell whether init() created a ring.
struct io_uring ring_ {};
// Jobs collected since the last submit(); failed in bulk if io_uring_submit errors.
std::vector<AioReadJob *> submittingJobs_;
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,123 @@
#include "storage/aio/BatchReadJob.h"
#include "common/monitor/Recorder.h"
#include "common/utils/Duration.h"
#include "storage/store/StorageTarget.h"
namespace hf3fs::storage {
// Metrics for batch read jobs.
// Buffers successfully added to an RDMA transmission by addBufferToBatch.
monitor::CountRecorder rdmaWriteCount{"storage.rdma_write.count"};
// Buffers that could not be added to an RDMA transmission.
monitor::CountRecorder rdmaWriteFails{"storage.rdma_write.fails"};
// Total bytes of the buffers successfully added by addBufferToBatch.
monitor::CountRecorder rdmaWriteBytes{"storage.rdma_write.bytes"};
// Time from a batch's start time until its last job finishes (sampled in BatchReadJob::finish).
monitor::LatencyRecorder batchReadLatency{"storage.aio.batch_latency"};
// Full-chunk reads whose recalculated checksum differs from the stored chunk checksum.
monitor::CountRecorder aioChecksumMismatch{"storage.aio.checksum_mismatch"};
/// Binds the job to its request, result slot and owning batch, and precomputes how far
/// the requested range is from kAIOAlignSize boundaries on both sides.
AioReadJob::AioReadJob(const ReadIO &readIO, IOResult &result, BatchReadJob &batch)
    : readIO_(readIO),
      result_(result),
      batch_(batch) {
  const auto requestEnd = readIO_.offset + readIO_.length;
  // Bytes between the preceding alignment boundary and the requested offset.
  state_.headLength = readIO_.offset % kAIOAlignSize;
  // Bytes from the end of the request up to the next alignment boundary (0 if aligned).
  state_.tailLength = (kAIOAlignSize - requestEnd % kAIOAlignSize) % kAIOAlignSize;
}
/// Finalizes this job: fills in the checksum, lets the storage target validate the
/// read, optionally re-verifies the chunk checksum, stores the outcome in the result
/// slot and notifies the owning batch.
///
/// @param lengthInfo  number of bytes read on success, or the error to report.
///
/// The job must not be used after this call: finishing the last job of a batch wakes
/// up whoever is waiting on the batch.
void AioReadJob::setResult(Result<uint32_t> lengthInfo) {
  if (lengthInfo) {
    auto checksumType = batch_.checksumType();
    if (checksumType == ChecksumType::NONE) {
      result_.checksum = {ChecksumType::NONE, 0U};  // do not return checksum
    } else if (checksumType == state_.chunkChecksum.type && readIO_.offset == 0 && *lengthInfo == state_.chunkLen) {
      result_.checksum = state_.chunkChecksum;  // use chunk checksum if the full chunk is read
    } else {  // calculate checksum of the read data
      auto dataBuf = state_.localbuf.subrange(state_.headLength, *lengthInfo);
      result_.checksum = ChecksumInfo::create(checksumType, dataBuf.ptr(), dataBuf.size());
    }

    // check chunk version.
    auto result = state_.storageTarget->aioFinishRead(*this);
    if (UNLIKELY(!result)) {
      lengthInfo = makeError(std::move(result.error()));
    } else if (batch_.recalculateChecksum() && readIO_.offset == 0 && *lengthInfo == state_.chunkLen) {
      // Only reached while lengthInfo still holds a value: once aioFinishRead has
      // replaced it with an error, dereferencing it would be invalid.
      auto realChecksum = ChecksumInfo::create(state_.chunkChecksum.type, state_.localbuf.ptr(), *lengthInfo);
      if (UNLIKELY(realChecksum != state_.chunkChecksum)) {
        aioChecksumMismatch.addSample(1);
        auto msg = fmt::format("aio checksum mismatch, read: {}, state: {}, checksum: {}",
                               readIO(),
                               state(),
                               realChecksum.value);
        XLOG(CRITICAL, msg);
        lengthInfo = makeError(StorageCode::kChecksumMismatch, std::move(msg));
      }
    }
  }

  XLOGF_IF(WARN, !lengthInfo, "Read job failed, result: {}, read io: {}, state: {}", lengthInfo, readIO_, state_);
  XLOGF(DBG7, "Read job completed, result: {}, read io: {}, state: {}", lengthInfo, readIO_, state_);

  result_.lengthInfo = std::move(lengthInfo);
  // Drop the chunk engine reference before signalling completion.
  state_.chunkEngineJob.reset();
  batch_.finish(this);
}
/// Creates one AioReadJob per request, pairing readIOs[i] with results[i].
///
/// @param readIOs       read requests; referenced, not copied, by the jobs.
/// @param results       result slots, indexed in parallel with `readIOs`.
/// @param checksumType  checksum type reported by checksumType().
BatchReadJob::BatchReadJob(std::span<const ReadIO> readIOs, std::span<IOResult> results, ChecksumType checksumType)
    : checksumType_(checksumType) {
  const auto jobCount = readIOs.size();
  jobs_.reserve(jobCount);
  for (size_t idx = 0; idx < jobCount; ++idx) {
    jobs_.emplace_back(readIOs[idx], results[idx], *this);
  }
}
/// Adds the payload of every successful job to an RDMA transmission batch.
/// A job whose buffer cannot be added has its result replaced by that error.
///
/// @param batch  transmission to append (remote buffer, local buffer) pairs to.
/// @return total number of bytes successfully added.
size_t BatchReadJob::addBufferToBatch(serde::CallContext::RDMATransmission &batch) {
  size_t addedCount = 0;
  size_t addedBytes = 0;
  for (auto &job : jobs_) {
    if (!job.result().lengthInfo) {
      continue;  // failed jobs carry no payload.
    }
    auto length = *job.result().lengthInfo;
    // Skip the alignment head so only the requested bytes are sent.
    auto payload = job.state().localbuf.subrange(job.state().headLength, length);
    auto added = batch.add(job.readIO().rdmabuf, payload);
    if (UNLIKELY(!added)) {
      rdmaWriteFails.addSample(1);
      job.result().lengthInfo = makeError(std::move(added.error()));
      continue;
    }
    ++addedCount;
    addedBytes += length;
  }
  rdmaWriteCount.addSample(addedCount);
  rdmaWriteBytes.addSample(addedBytes);
  return addedBytes;
}
/// Appends the payload of every successful job to `buffer`, in job order.
///
/// @param buffer  destination; existing content is kept and new bytes are appended.
/// @return sum of the result lengths of all successful jobs.
size_t BatchReadJob::copyToRespBuffer(std::vector<uint8_t> &buffer) {
  size_t sendBytes = 0;
  for (auto &job : jobs_) {
    if (!job.result().lengthInfo) {
      continue;  // failed jobs contribute nothing.
    }
    auto length = *job.result().lengthInfo;
    // Skip the alignment head so only the requested bytes are copied.
    auto localbuf = job.state().localbuf.subrange(job.state().headLength, length);
    const size_t bufEnd = buffer.size();
    // First append: size the buffer for the whole batch, using this payload as the estimate.
    if (buffer.empty()) buffer.reserve(localbuf.size() * jobs_.size());
    // A zero-length payload must not touch the buffer: indexing an empty vector at
    // position 0 is out of range, and there is nothing to copy anyway.
    if (localbuf.size() > 0) {
      buffer.resize(bufEnd + localbuf.size());
      std::memcpy(buffer.data() + bufEnd, localbuf.ptr(), localbuf.size());
    }
    sendBytes += length;
  }
  return sendBytes;
}
/// Marks one job of the batch as finished. When the last job finishes, records the
/// batch latency and posts the baton awaited by complete().
///
/// @param job  the finished job; currently unused.
void BatchReadJob::finish(AioReadJob *job) {
  (void)job;
  const auto finishedNow = finishedCount_.fetch_add(1) + 1;
  if (finishedNow != jobs_.size()) {
    return;
  }
  batchReadLatency.addSample(RelativeTime::now() - startTime());
  baton_.post();
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,145 @@
#pragma once
#include <folly/experimental/coro/Baton.h>
#include <utility>
#include "chunk_engine/src/cxx.rs.h"
#include "common/net/ib/IBSocket.h"
#include "common/serde/CallContext.h"
#include "common/utils/Duration.h"
#include "fbs/storage/Common.h"
#include "storage/store/ChunkMetadata.h"
namespace hf3fs::storage {
class BatchReadJob;
class StorageTarget;
/// Move-only holder for a raw chunk obtained from the chunk engine. The chunk is
/// handed back through Engine::release_raw_chunk when the holder is reset or destroyed.
class ChunkEngineReadJob {
 public:
  ChunkEngineReadJob() = default;
  ChunkEngineReadJob(const ChunkEngineReadJob &) = delete;
  /// Transfers the held chunk; `other` is left empty.
  ChunkEngineReadJob(ChunkEngineReadJob &&other) noexcept
      : engine_(std::exchange(other.engine_, nullptr)),
        chunk_(std::exchange(other.chunk_, nullptr)) {}

  /// Releases any chunk currently held, then starts holding `chunk` from `engine`.
  void set(chunk_engine::Engine *engine, const chunk_engine::Chunk *chunk) {
    reset();
    engine_ = engine;
    chunk_ = chunk;
  }

  /// Releases the held chunk, if any, and leaves the holder empty.
  void reset() {
    if (engine_ && chunk_) {
      engine_->release_raw_chunk(chunk_);
    }
    // Clear both pointers so chunk()/has_chunk() never expose a chunk that has
    // already been handed back to the engine.
    engine_ = nullptr;
    chunk_ = nullptr;
  }

  auto chunk() const { return chunk_; }
  bool has_chunk() const { return chunk_ != nullptr; }

  ~ChunkEngineReadJob() { reset(); }

 private:
  chunk_engine::Engine *engine_{};
  const chunk_engine::Chunk *chunk_{};
};
// A single read request within a BatchReadJob, together with the state needed to
// issue the read and to report its result.
class AioReadJob {
public:
// Stores references to `readIO`, `result` and `batch`; they must outlive the job.
AioReadJob(const ReadIO &readIO, IOResult &result, BatchReadJob &batch);
auto &readIO() { return readIO_; }
auto &result() { return result_; }
auto &batch() { return batch_; }
auto &state() { return state_; }
// Finalizes the job with a byte count or an error and notifies the batch.
void setResult(Result<uint32_t> lengthInfo);
// Requested offset moved back by headLength.
uint32_t alignedOffset() const { return readIO_.offset - state_.headLength; }
// Requested length extended by headLength and tailLength.
uint32_t alignedLength() const { return readIO_.length + state_.headLength + state_.tailLength; }
auto startTime() const { return startTime_; }
void resetStartTime() { startTime_ = RelativeTime::now(); }
private:
const ReadIO &readIO_;
IOResult &result_;
BatchReadJob &batch_;
// Per-job working state. headLength/tailLength are set by the constructor; the read
// path in AioStatus.cpp consumes readFd, readLength, readOffset, bufferIndex and fdIndex.
struct State {
// Local buffer the data is read into.
net::RDMABuf localbuf{};
StorageTarget *storageTarget = nullptr;
// Holds the chunk engine's raw chunk, if any; reset in setResult().
ChunkEngineReadJob chunkEngineJob{};
SERDE_STRUCT_FIELD(headLength, uint32_t{});
SERDE_STRUCT_FIELD(tailLength, uint32_t{});
SERDE_STRUCT_FIELD(readLength, uint32_t{}); // after cropping.
SERDE_STRUCT_FIELD(readFd, int32_t{});
SERDE_STRUCT_FIELD(readOffset, uint64_t{});
SERDE_STRUCT_FIELD(chunkLen, uint32_t{});
SERDE_STRUCT_FIELD(bufferIndex, uint32_t{});
// When set, used by the io_uring path in place of readFd, with IOSQE_FIXED_FILE.
SERDE_STRUCT_FIELD(fdIndex, std::optional<uint32_t>{});
SERDE_STRUCT_FIELD(chunkChecksum, ChecksumInfo{});
SERDE_STRUCT_FIELD(readUncommitted, false);
} state_;
static_assert(serde::Serializable<State>);
RelativeTime startTime_{};
};
class BatchReadJob {
public:
BatchReadJob(std::span<const ReadIO> readIOs, std::span<IOResult> results, ChecksumType checksumType);
BatchReadJob(const ReadIO &readIO, StorageTarget *target, IOResult &result, ChecksumType checksumType)
: BatchReadJob(std::span(&readIO, 1), std::span(&result, 1), checksumType) {
jobs_.back().state().storageTarget = target;
}
CoTask<void> complete() { co_await baton_; }
size_t addBufferToBatch(serde::CallContext::RDMATransmission &batch);
size_t copyToRespBuffer(std::vector<uint8_t> &buffer);
void finish(AioReadJob *job);
auto checksumType() const { return checksumType_; }
bool recalculateChecksum() const { return recalculateChecksum_; }
void setRecalculateChecksum(bool value = true) { recalculateChecksum_ = value; }
auto &front() { return jobs_.front(); }
auto &front() const { return jobs_.front(); }
auto startTime() const { return startTime_.load(); }
void resetStartTime() { startTime_ = RelativeTime::now(); }
private:
friend class AioReadJobIterator;
std::vector<AioReadJob> jobs_;
folly::coro::Baton baton_;
std::atomic<uint64_t> finishedCount_{};
std::atomic<RelativeTime> startTime_ = RelativeTime::now();
const ChecksumType checksumType_;
bool recalculateChecksum_ = false;
};
/// Forward cursor over a contiguous range of jobs inside a BatchReadJob.
/// A default-constructed iterator has no batch (isNull() is true).
class AioReadJobIterator {
 public:
  AioReadJobIterator() = default;

  /// Covers every job of `batch`.
  AioReadJobIterator(BatchReadJob *batch)
      : batch_(batch),
        end_(batch->jobs_.size()) {}

  /// Covers up to `size` jobs starting at index `start`, clamped to the batch size.
  AioReadJobIterator(BatchReadJob *batch, uint32_t start, uint32_t size)
      : batch_(batch),
        begin_(start),
        end_(std::min(static_cast<uint32_t>(batch->jobs_.size()), start + size)) {}

  /// True while there are jobs left in the range.
  operator bool() const { return begin_ < end_; }
  bool isNull() const { return batch_ == nullptr; }

  AioReadJob &operator*() { return batch_->jobs_[begin_]; }
  AioReadJob *operator->() { return &batch_->jobs_[begin_]; }
  /// Returns the current job and advances to the next one.
  AioReadJob *operator++(int) {
    AioReadJob *current = &batch_->jobs_[begin_];
    ++begin_;
    return current;
  }

  auto startTime() const { return startTime_; }
  auto resetStartTime() { startTime_ = RelativeTime::now(); }

 private:
  BatchReadJob *batch_ = nullptr;
  uint32_t begin_ = 0;
  uint32_t end_ = 0;
  RelativeTime startTime_ = RelativeTime::now();
};
} // namespace hf3fs::storage

2
src/storage/chunk_engine/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
/target
/lcov.info

View File

@@ -0,0 +1,40 @@
[package]
name = "chunk_engine"
version = "0.1.11"
edition = "2021"
[lib]
crate-type = ["lib", "staticlib"]
[dependencies]
anyhow = "1"
byteorder = "1"
crc32c = "0"
cxx = "1"
dashmap = "6"
derse = { version = ">=0.1.32", features = ["tinyvec"] }
lazy_static = "1"
libc = "0"
lockmap = "0.1.6"
rand = "0"
rocksdb = "0"
rolling-file = "0"
serde = { version = "1", features = ["derive"] }
static_assertions = "1"
tinyvec = { version = "1", features = ["alloc"] }
toml = "0"
tracing = "0"
tracing-appender = "0"
tracing-subscriber = { version = "0", features = ["fmt"] }
[dev-dependencies]
clap = { version = "4", features = ["derive"] }
tempfile = "3"
criterion = "0"
[build-dependencies]
cxx-build = "1"
[[bench]]
name = "bench_allocator"
harness = false

View File

@@ -0,0 +1,62 @@
# chunk-engine
### Design
1. The entire Chunk Engine can be divided into two components:
1. **Allocator**: Responsible for allocating/reclaiming chunks and modifying memory states.
2. **MetaStore**: Responsible for persisting allocation/reclamation events.
2. Workflow for writing a new chunk:
1. The **Allocator** assigns a new chunk position, pointing to a disk space (purely in-memory operation).
2. Write data to this chunk position. If a power failure or write failure occurs at this stage, no existing data is affected.
3. Generate corresponding chunk metadata and persist it alongside the allocation event to the **MetaStore**. Using RocksDB's WriteBatch ensures **atomic** updates—the entire write operation either succeeds or fails, with no intermediate states.
3. Maintaining the Allocator's in-memory state:
1. At startup, the Allocator **quickly** loads all allocation information from RocksDB.
2. Allocation is performed in-memory first, followed by persistence. If a failure occurs before persistence, the allocation event is lost.
3. Reclamation first persists the event to disk, then modifies the memory state. Even if a chunk deletion event is persisted, the chunk remains readable as long as memory holds its reference.
4. This ensures conflict-free read/write operations: a read operation acquires a chunk reference, guaranteeing the chunk's validity until the read completes.
4. Use `Arc` to manage ownership of chunk position:
1. For allocation, returns an `Arc<ChunkPos>`. If persistence fails, the position is automatically released when the `Arc` is dropped.
2. Read operations also return an `Arc<ChunkPos>`, ensuring safe data access even during concurrent writes or deletions.
### Allocator
Storage hierarchy:
1. **Chunk**: Basic data unit, currently proposed as 64KB, 512KB, and 4MB.
2. **Group**: Each group contains 256 chunks (16MB, 128MB, or 1GB depending on chunk size).
3. **File**: For 512KB chunks, a single file (~120GB) contains ~960 groups.
4. **Disk**: Single disk capacity of 30TB, divided into 256 files per chunk size.
5. **Node**: A single node contains 10-20 disks.
This configuration supports up to ~1.2 billion chunks and ~5 million groups per machine.
Implementation details:
1. Each group uses a 256-bit bitset (4 `uint64_t`) to track allocation status.
2. Maintain three in-memory structures:
- `allocated_groups`: Groups with allocated space but no chunks assigned.
- `unallocated_groups`: Groups without allocated space.
- `active_groups`: Map of `<group_id, group_state>` tracking allocation status.
3. Chunk allocation workflow:
1. Prioritize finding free slots in `active_groups` using **`__builtin_ctz`** for fast bitwise operations.
2. If `active_groups` is empty, acquire a new group from `allocated_groups`.
3. If `allocated_groups` is empty, fetch a group from `unallocated_groups` and allocate disk space synchronously.
4. Background threads:
- **`allocate_thread`**: Maintains `active_groups` within a target size range to ensure in-memory allocation efficiency.
- **`compact_thread`**: Periodically scans `active_groups`, migrates all chunks from selected groups, releases space, and returns groups to `allocated_groups`.
### MetaStore
Persists three mappings:
1. **`chunk_id -> chunk_meta`**: Metadata includes chunk location, length, hash, version, etc., serialized using **`derse`**.
2. **`group_id -> group_state`**: Tracks chunk allocation status within groups, leveraging RocksDB's **MergeOp** for atomic updates.
3. **`chunk_pos -> chunk_id`**: Maps physical positions to chunk IDs, used by `compact_thread` during chunk migration.
### Chunk Engine
1. **MetaCache**: Maintains an in-memory `chunk_id -> chunk_info` mapping, where `chunk_info` includes `chunk_meta` and `Arc<ChunkPos>`.
2. **Read operation**: Returns `chunk_info`. The `Arc<ChunkPos>` ensures safe data access until the read completes.
3. **Write operation workflow**:
1. Query `MetaCache` to retrieve the current `chunk_info`.
2. Invoke `Allocator::allocate()` to obtain a new chunk position.
3. Read existing chunk data, write it to the new chunk position, append the new write request, and generate `new_chunk_info`.
4. Persist `new_chunk_info` to the **MetaStore** along with a release record for the original chunk position.

View File

@@ -0,0 +1,42 @@
use chunk_engine::*;
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
use std::sync::Arc;
/// Allocates `n` chunks one after another, dropping each chunk right away.
fn allocate(allocator: &Arc<Allocator>, n: usize) {
    (0..n).for_each(|_| drop(allocator.allocate(true).unwrap()));
}
fn criterion_benchmark(c: &mut Criterion) {
let dir = tempfile::tempdir().unwrap();
let cluster_config = ClustersConfig {
path: dir.path().into(),
chunk_size: CHUNK_SIZE_NORMAL,
create: true,
};
let clusters = Clusters::open(&cluster_config).unwrap();
let meta_store_config = MetaStoreConfig {
rocksdb: RocksDBConfig {
path: dir.path().join("meta"),
create: true,
..Default::default()
},
..Default::default()
};
let meta_store = std::sync::Arc::new(MetaStore::open(&meta_store_config).unwrap());
let allocator = chunk_engine::Allocator::load(clusters, meta_store.iterator()).unwrap();
allocator.do_allocate_task(1, 1, &meta_store).unwrap();
let count: usize = 1 << 16;
c.bench_with_input(BenchmarkId::new("allocate", count), &count, |b, &c| {
b.iter(|| allocate(&allocator, c))
});
}
// Register `criterion_benchmark` in the `benches` group and generate the benchmark
// entry point (the [[bench]] target is declared with `harness = false`).
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

View File

@@ -0,0 +1,4 @@
// Build script: runs the cxx bridge code generator on src/cxx.rs and asks Cargo to
// re-run this script whenever that file changes.
fn main() {
// The returned builder is intentionally discarded; nothing is compiled from it here.
let _ = cxx_build::bridge("src/cxx.rs");
println!("cargo:rerun-if-changed=src/cxx.rs");
}

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 77 KiB

View File

@@ -0,0 +1,94 @@
use chunk_engine::*;
use clap::Parser;
use derse::Deserialize;
use std::{
collections::{BTreeMap, HashMap},
path::PathBuf,
sync::Arc,
};
/// Checks that the allocator counters in a chunk engine meta store match its chunk metadata.
#[derive(Parser, Debug, Clone)]
#[command(version, about, long_about = None)]
pub struct Args {
    /// Path to rocksdb.
    pub path: PathBuf,
}
/// Opens the meta store read-only, loads the allocator state for every chunk size from
/// `CHUNK_SIZE_SMALL` doubling up to `CHUNK_SIZE_ULTRA`, then walks all chunk metadata
/// records and asserts that the number of chunks per size equals the allocator's
/// allocated-minus-reserved count.
fn main() -> Result<()> {
    let args = Args::parse();

    let meta_config = MetaStoreConfig {
        rocksdb: RocksDBConfig {
            path: args.path,
            create: false,
            read_only: true,
        },
        prefix_len: 4,
    };
    let meta_store = MetaStore::open(&meta_config)?;

    let mut chunk_allocators = HashMap::new();
    let mut used_map = BTreeMap::new(); // chunk size -> allocated - reserved chunks
    let mut reserved_map = BTreeMap::new(); // chunk size -> reserved chunks
    let mut group_count = BTreeMap::new(); // chunk size -> (full groups, active groups)
    let mut real_map = BTreeMap::new(); // chunk size -> chunks found in the metadata

    // Pass 1: load one allocator per chunk size and record its counters.
    let mut chunk_size = CHUNK_SIZE_SMALL;
    loop {
        let counter = Arc::new(AllocatorCounter::new(chunk_size));
        let chunk_allocator =
            ChunkAllocator::load(meta_store.iterator(), counter.clone(), chunk_size)?;

        let allocated_chunks = counter.allocated_chunks();
        let reserved_chunks = counter.reserved_chunks();
        used_map.insert(chunk_size, allocated_chunks - reserved_chunks);
        reserved_map.insert(chunk_size, reserved_chunks);
        group_count.insert(
            chunk_size,
            (
                chunk_allocator.full_groups.len(),
                chunk_allocator.active_groups.len(),
            ),
        );
        real_map.insert(chunk_size, 0u64);
        chunk_allocators.insert(chunk_size, chunk_allocator);

        if chunk_size >= CHUNK_SIZE_ULTRA {
            break;
        }
        chunk_size *= 2;
    }

    // Pass 2: visit every chunk meta record after the prefix key itself.
    let mut it = meta_store.iterator();
    let end_key = MetaKey::chunk_meta_key_prefix();
    it.seek(&end_key)?;
    if it.key() == Some(end_key.as_ref()) {
        it.next(); // [begin, end)
    }

    while it.valid() && it.key().unwrap()[0] == MetaKey::CHUNK_META_KEY_PREFIX {
        let chunk_meta =
            ChunkMeta::deserialize(it.value().unwrap()).map_err(Error::SerializationError)?;
        let chunk_size = chunk_meta.pos.chunk_size();

        let allocator = chunk_allocators.get_mut(&chunk_size).unwrap();
        allocator.reference(chunk_meta.pos, true);

        // Only sizes registered in pass 1 are counted; nothing is inserted here.
        if let Some(found) = real_map.get_mut(&chunk_size) {
            *found += 1;
        }
        it.next();
    }

    println!("{:#?}", used_map);
    println!("{:#?}", reserved_map);
    println!("{:#?}", group_count);

    assert_eq!(used_map, real_map);

    Ok(())
}

View File

@@ -0,0 +1,258 @@
use super::super::*;
use std::sync::{Arc, Mutex};
/// Shareable front end over one `ChunkAllocator`, guarded by a mutex.
pub struct Allocator {
/// Allocation state; every method of `Allocator` locks this before use.
allocator: Mutex<ChunkAllocator>,
/// Counter handle that is also handed to `ChunkAllocator::load`.
pub counter: Arc<AllocatorCounter>,
/// Cluster storage; provides `chunk_size` and group (de)allocation.
pub clusters: Clusters,
}
impl Allocator {
    /// Builds an allocator for `clusters`, restoring group state from `it`.
    ///
    /// The counter is created for `clusters.chunk_size` and shared with the
    /// loaded `ChunkAllocator`. Fails if `ChunkAllocator::load` fails.
    pub fn load(clusters: Clusters, it: RocksDBIterator) -> Result<Arc<Allocator>> {
        let counter = Arc::new(AllocatorCounter::new(clusters.chunk_size));
        let chunk_allocator = ChunkAllocator::load(it, counter.clone(), clusters.chunk_size)?;
        Ok(Arc::new(Self {
            allocator: Mutex::new(chunk_allocator),
            counter,
            clusters,
        }))
    }

    /// Allocates one position and wraps it in a `Chunk` whose metadata is
    /// default except for `pos`. `allow_to_allocate` is forwarded to
    /// `ChunkAllocator::allocate`.
    pub fn allocate(self: &Arc<Self>, allow_to_allocate: bool) -> Result<Chunk> {
        let mut guard = self.allocator.lock().unwrap();
        let pos = guard.allocate(&self.clusters, allow_to_allocate)?;
        let meta = ChunkMeta {
            pos,
            ..Default::default()
        };
        Ok(Chunk::new(meta, self.clone()))
    }

    /// Registers one more reference to `meta.pos` and returns a `Chunk`
    /// handle carrying `meta`.
    pub fn reference(self: &Arc<Self>, meta: ChunkMeta, first_ref: bool) -> Chunk {
        let mut guard = self.allocator.lock().unwrap();
        guard.reference(meta.pos, first_ref);
        Chunk::new(meta, self.clone())
    }

    /// Drops one reference to `pos` (see `ChunkAllocator::dereference`).
    pub fn dereference(&self, pos: Position) {
        self.allocator.lock().unwrap().dereference(pos);
    }

    /// Asks the group allocator which group-level task, if any, is due.
    pub fn get_allocate_task(&self, min_remain: usize, max_remain: usize) -> AllocateTask {
        let mut guard = self.allocator.lock().unwrap();
        guard.group_allocator.get_allocate_task(min_remain, max_remain)
    }

    /// Reports the outcome of a task obtained from `get_allocate_task`.
    pub fn finish_allocate_task(&self, task: AllocateTask, succ: bool) {
        let mut guard = self.allocator.lock().unwrap();
        guard.group_allocator.finish_allocate_task(task, succ);
    }

    /// Fetches the next group task, executes it against the cluster storage
    /// and `meta_store`, and reports the outcome back to the group allocator.
    ///
    /// Returns the task that was handled (`AllocateTask::None` when nothing
    /// was due) or the error of the failed step.
    pub fn do_allocate_task(
        &self,
        min_remain: usize,
        max_remain: usize,
        meta_store: &MetaStore,
    ) -> Result<AllocateTask> {
        let task = self.get_allocate_task(min_remain, max_remain);
        let outcome = match task {
            AllocateTask::None => return Ok(task),
            // Allocate: cluster storage first, then the meta store record.
            AllocateTask::Allocate(group_id) => (|| {
                self.clusters.allocate(group_id)?;
                meta_store.allocate_group(group_id)
            })(),
            // Deallocate: meta store record first, then the cluster storage.
            AllocateTask::Deallocate(group_id) => (|| {
                tracing::warn!("deallocate group: {:?}", group_id);
                meta_store.remove_group(group_id)?;
                self.clusters.deallocate(group_id)
            })(),
        };
        // The group allocator is told about success and failure alike.
        let succeeded = outcome.is_ok();
        self.finish_allocate_task(task, succeeded);
        outcome?;
        Ok(task)
    }

    /// Picks a group to compact when reserved chunks exceed `max_reserved`.
    pub fn get_compact_task(&self, max_reserved: u64) -> Option<GroupId> {
        let mut guard = self.allocator.lock().unwrap();
        guard.get_compact_task(max_reserved)
    }

    /// Marks the compaction of `group_id` as finished.
    pub fn finish_compact_task(&self, group_id: GroupId) {
        self.allocator.lock().unwrap().finish_compact_task(group_id);
    }
}
// Drop only emits a log line naming the clusters; it performs no other cleanup itself.
impl Drop for Allocator {
fn drop(&mut self) {
tracing::info!("Allocator {:?} is dropping...", self.clusters);
}
}
#[cfg(test)]
mod tests {
use super::*;
// End-to-end test: allocation reuse, group bookkeeping, concurrent
// write/read through chunks, and release of all groups on drop.
#[test]
fn test_allocator() {
use rand::seq::SliceRandom;
let dir = tempfile::tempdir().unwrap();
let cluster_config = ClustersConfig {
path: dir.path().into(),
chunk_size: CHUNK_SIZE_NORMAL,
create: true,
};
let clusters = Clusters::open(&cluster_config).unwrap();
let meta_store_config = MetaStoreConfig {
rocksdb: RocksDBConfig {
path: dir.path().join("meta"),
create: true,
..Default::default()
},
..Default::default()
};
let meta_store = Arc::new(MetaStore::open(&meta_store_config).unwrap());
let allocator = Allocator::load(clusters, meta_store.iterator()).unwrap();
// Each chunk is dropped at the end of its iteration, so the same first
// position is handed out again every time.
for _ in 0..10000 {
let chunk = allocator.allocate(true).unwrap();
assert_eq!(chunk.meta().pos, Position::new(GroupId::default(), 0));
}
const N: usize = 1000;
let mut chunks = vec![];
// Keep N chunks alive at once.
for _ in 0..N {
let chunk = allocator.allocate(true).unwrap();
chunks.push(std::sync::Arc::new(chunk));
}
{
// NOTE(review): 256 is presumably GroupState::TOTAL_BITS — confirm.
let allocator = allocator.allocator.lock().unwrap();
assert_eq!(allocator.full_groups.len(), N / 256);
assert_eq!(allocator.active_groups.len(), 1);
assert_eq!(
allocator.active_groups.iter().next().unwrap().1.count() as usize,
N % 256
);
}
const T: usize = 8;
// T writer threads; thread i handles chunks whose index % T == i and
// fills the chunk's first aligned block with that index byte.
(0..T)
.map(|i| {
let chunks = chunks.clone();
std::thread::spawn(move || {
let mut vec = create_aligned_vec(ALIGN_SIZE);
vec.fill(0);
for chunk in chunks.iter() {
if chunk.meta().pos.index() as usize % T == i {
vec.fill(chunk.meta().pos.index());
chunk.pwrite(&vec[..], 0).unwrap();
}
}
})
})
.collect::<Vec<_>>()
.into_iter()
.for_each(|t| t.join().unwrap());
chunks.shuffle(&mut rand::thread_rng());
// T reader threads verify the first 8 bytes of every chunk.
(0..T)
.map(|i| {
let chunks = chunks.clone();
std::thread::spawn(move || {
let mut buf = [0u8; 8];
for chunk in chunks.iter() {
if chunk.meta().pos.index() as usize % T == i {
assert!(chunk.pread(&mut buf, 0).is_ok());
assert_eq!(buf, [chunk.meta().pos.index(); 8]);
}
}
})
})
.collect::<Vec<_>>()
.into_iter()
.for_each(|t| t.join().unwrap());
// Dropping every chunk must leave no full or active group behind.
chunks.clear();
{
let allocator = allocator.allocator.lock().unwrap();
assert!(allocator.full_groups.is_empty());
assert!(allocator.active_groups.is_empty());
}
}
// Exercises the background group allocate/deallocate task and its effect
// on the counters.
#[test]
fn test_allocator_do_allocate_task() {
let dir = tempfile::tempdir().unwrap();
const S: Size = CHUNK_SIZE_NORMAL;
let cluster_config = ClustersConfig {
path: dir.path().into(),
chunk_size: S,
create: true,
};
let clusters = Clusters::open(&cluster_config).unwrap();
let meta_store_config = MetaStoreConfig {
rocksdb: RocksDBConfig {
path: dir.path().join("meta"),
create: true,
..Default::default()
},
..Default::default()
};
let meta_store = Arc::new(MetaStore::open(&meta_store_config).unwrap());
let allocator = Allocator::load(clusters, meta_store.iterator()).unwrap();
// With min_remain = 4 the task allocates until four spare groups exist.
for _ in 0..4 {
assert!(matches!(
allocator.do_allocate_task(4, 8, &meta_store).unwrap(),
AllocateTask::Allocate(_)
));
}
assert!(matches!(
allocator.do_allocate_task(4, 8, &meta_store).unwrap(),
AllocateTask::None
));
let s = allocator.counter.used_size();
assert_eq!(s.allocated_size, S * GroupState::TOTAL_BITS as u64 * 4);
assert_eq!(s.reserved_size, S * GroupState::TOTAL_BITS as u64 * 4);
// With max_remain = 2 the task deallocates two of the four spare groups.
for _ in 2..4 {
assert!(matches!(
allocator.do_allocate_task(1, 2, &meta_store).unwrap(),
AllocateTask::Deallocate(_)
));
}
assert!(matches!(
allocator.do_allocate_task(1, 2, &meta_store).unwrap(),
AllocateTask::None
));
let s = allocator.counter.used_size();
assert_eq!(s.allocated_size, S * GroupState::TOTAL_BITS as u64 * 2);
assert_eq!(s.reserved_size, S * GroupState::TOTAL_BITS as u64 * 2);
}
}

View File

@@ -0,0 +1,88 @@
use super::super::*;
use std::sync::atomic::{AtomicU64, Ordering};
/// Atomic usage counters for the allocator of one chunk size.
#[derive(Default)]
pub struct AllocatorCounter {
/// Chunk size used to convert chunk counts into sizes in `used_size`.
pub chunk_size: Size,
/// Chunks covered by allocated groups; moves in steps of `GroupState::TOTAL_BITS`.
pub allocated_chunks: AtomicU64,
/// Chunks in allocated groups that are free: decremented by
/// `allocate_chunk`, incremented by `deallocate_chunk`.
pub reserved_chunks: AtomicU64,
/// Counter reported as `UsedSize::position_count`; not modified in this file.
pub position_count: AtomicU64,
/// Counter reported as `UsedSize::position_rc`; not modified in this file.
pub position_rc: AtomicU64,
}
/// Plain-value snapshot of an `AllocatorCounter`, with chunk counts converted
/// to sizes. Laid out as `repr(C)`.
#[derive(Default, Clone, Copy, PartialEq, Eq, Debug)]
#[repr(C)]
pub struct UsedSize {
/// `allocated_chunks * chunk_size`.
pub allocated_size: Size,
/// `reserved_chunks * chunk_size`.
pub reserved_size: Size,
/// Value of the `position_count` counter.
pub position_count: u64,
/// Value of the `position_rc` counter.
pub position_rc: u64,
}
/// Field-wise addition, so iterators of `UsedSize` can be `.sum()`-ed.
impl std::iter::Sum for UsedSize {
    fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
        // Start from the all-zero default and accumulate each field.
        iter.fold(UsedSize::default(), |mut total, item| {
            total.allocated_size += item.allocated_size;
            total.reserved_size += item.reserved_size;
            total.position_count += item.position_count;
            total.position_rc += item.position_rc;
            total
        })
    }
}
impl AllocatorCounter {
    /// Creates a counter for `chunk_size` with every count at zero.
    pub fn new(chunk_size: Size) -> Self {
        let mut counter = Self::default();
        counter.chunk_size = chunk_size;
        counter
    }

    /// Current number of chunks covered by allocated groups.
    pub fn allocated_chunks(&self) -> u64 {
        self.allocated_chunks.load(Ordering::Acquire)
    }

    /// Current number of free chunks inside allocated groups.
    pub fn reserved_chunks(&self) -> u64 {
        self.reserved_chunks.load(Ordering::Acquire)
    }

    /// Takes a snapshot of all counters, converting chunk counts to sizes.
    /// The four loads are separate, so the snapshot is not taken atomically
    /// as a whole.
    pub fn used_size(&self) -> UsedSize {
        let allocated_size = self.allocated_chunks() * self.chunk_size;
        let reserved_size = self.reserved_chunks() * self.chunk_size;
        let position_count = self.position_count.load(Ordering::Acquire);
        let position_rc = self.position_rc.load(Ordering::Acquire);
        UsedSize {
            allocated_size,
            reserved_size,
            position_count,
            position_rc,
        }
    }

    /// Overwrites the allocated and reserved chunk counts.
    pub fn init(&self, allocated_count: u64, reserved_count: u64) {
        self.allocated_chunks
            .store(allocated_count, Ordering::Release);
        self.reserved_chunks
            .store(reserved_count, Ordering::Release);
    }

    /// Accounts for a newly allocated group: all of its chunks become both
    /// allocated and reserved.
    pub fn allocate_group(&self) {
        let group_chunks = GroupState::TOTAL_BITS as u64;
        self.allocated_chunks
            .fetch_add(group_chunks, Ordering::SeqCst);
        self.reserved_chunks
            .fetch_add(group_chunks, Ordering::SeqCst);
    }

    /// Reverses `allocate_group` for a group that was given back.
    pub fn deallocate_group(&self) {
        let group_chunks = GroupState::TOTAL_BITS as u64;
        self.allocated_chunks
            .fetch_sub(group_chunks, Ordering::SeqCst);
        self.reserved_chunks
            .fetch_sub(group_chunks, Ordering::SeqCst);
    }

    /// One chunk was handed out: one fewer reserved chunk.
    pub fn allocate_chunk(&self) {
        self.reserved_chunks.fetch_sub(1, Ordering::SeqCst);
    }

    /// One chunk was returned: one more reserved chunk.
    pub fn deallocate_chunk(&self) {
        self.reserved_chunks.fetch_add(1, Ordering::SeqCst);
    }
}

View File

@@ -0,0 +1,200 @@
use super::super::*;
use std::path::Path;
use std::sync::Arc;
/// One `Allocator` per supported chunk size plus the shared meta store.
#[derive(Clone)]
pub struct Allocators {
/// Allocators ordered by chunk size: entry `i` serves `CHUNK_SIZE_SMALL * 2^i`.
pub vec: [Arc<Allocator>; CHUNK_SIZE_NUMBER],
/// Meta store passed to `Allocator::do_allocate_task`.
meta_store: Arc<MetaStore>,
}
impl Allocators {
/// Opens (or creates, when `create` is true) one allocator per chunk size.
///
/// Allocator `i` serves chunk size `CHUNK_SIZE_SMALL * 2^i` for
/// `i in 0..CHUNK_SIZE_NUMBER`. Returns the first error produced while
/// opening any of them.
pub fn new(path: &Path, create: bool, meta_store: Arc<MetaStore>) -> Result<Self> {
    let mut allocators = Vec::with_capacity(CHUNK_SIZE_NUMBER);
    for i in 0..CHUNK_SIZE_NUMBER {
        let chunk_size = CHUNK_SIZE_SMALL * (1 << i);
        let allocator = Self::create(path, create, &meta_store, chunk_size)?;
        allocators.push(allocator);
    }
    Ok(Self {
        // Built from the array length itself, so it keeps working if
        // `CHUNK_SIZE_NUMBER` changes (no hand-written index list).
        vec: std::array::from_fn(|i| allocators[i].clone()),
        meta_store,
    })
}
/// Opens the cluster storage for `chunk_size` under `path/<chunk_size>` and
/// loads its allocator state from `meta_store`.
fn create(
    path: &Path,
    create: bool,
    meta_store: &Arc<MetaStore>,
    chunk_size: Size,
) -> Result<Arc<Allocator>> {
    // Each chunk size gets its own sub-directory named after the size.
    let cluster_dir = path.join(chunk_size.to_string());
    let clusters = Clusters::open(&ClustersConfig {
        path: cluster_dir,
        chunk_size,
        create,
    })?;
    let loaded = Allocator::load(clusters, meta_store.iterator())?;
    tracing::info!("Allocator {:?} is created...", loaded.clusters);
    Ok(loaded)
}
pub fn select_by_pos(&self, pos: Position) -> Result<&Arc<Allocator>> {
let chunk_size = pos.chunk_size();
if chunk_size.is_power_of_two()
&& CHUNK_SIZE_SMALL <= chunk_size
&& chunk_size <= CHUNK_SIZE_ULTRA
{
Ok(&self.vec[chunk_size.trailing_zeros() as usize - CHUNK_SIZE_SHIFT])
} else {
Err(Error::InvalidArg(format!(
"select allocator invalid pos: {pos:?}"
)))
}
}
pub fn select_by_size(&self, size: Size) -> Result<&Arc<Allocator>> {
if size <= CHUNK_SIZE_SMALL {
Ok(&self.vec[0])
} else if size <= CHUNK_SIZE_ULTRA {
Ok(&self.vec[size.next_power_of_two().trailing_zeros() as usize - CHUNK_SIZE_SHIFT])
} else {
Err(Error::InvalidArg(format!(
"select allocator invalid size: {size:?}"
)))
}
}
/// Allocates a chunk able to hold `size` bytes from the matching allocator.
pub fn allocate(&self, size: Size, allow_to_allocate: bool) -> Result<Chunk> {
    self.select_by_size(size)?.allocate(allow_to_allocate)
}
/// Runs up to `batch_size` group tasks on each selected allocator and returns
/// how many tasks completed successfully.
///
/// `allocate_ultra_groups` selects which allocators take part: only those
/// with a chunk size above `CHUNK_SIZE_LARGE` when true, only the others when
/// false. An allocator is left as soon as it has no task or a task fails; the
/// error itself is discarded.
pub fn allocate_groups(
    &self,
    min_remain: usize,
    max_remain: usize,
    batch_size: usize,
    allocate_ultra_groups: bool,
) -> usize {
    let mut completed = 0usize;
    let selected = self.vec.iter().filter(|allocator| {
        (allocator.clusters.chunk_size > CHUNK_SIZE_LARGE) == allocate_ultra_groups
    });
    for allocator in selected {
        for _ in 0..batch_size {
            match allocator.do_allocate_task(min_remain, max_remain, &self.meta_store) {
                Ok(AllocateTask::None) | Err(_) => break,
                Ok(_) => completed += 1,
            }
        }
    }
    completed
}
/// Sums the usage snapshots of all allocators.
pub fn used_size(&self) -> UsedSize {
    let snapshots = self.vec.iter().map(|allocator| allocator.counter.used_size());
    snapshots.sum::<UsedSize>()
}
/// Collects groups to compact: at most one per allocator and at most
/// `MAX_TASKS` in total.
///
/// Despite the name, the entries are *compact* tasks: each one comes from
/// `Allocator::get_compact_task(max_reserved)`, and each returned group should
/// later be passed to `finish_compact_task`.
pub fn get_allocate_tasks(&self, max_reserved: u64) -> tinyvec::ArrayVec<[GroupId; 3]> {
    // Inline capacity of the returned `ArrayVec`.
    const MAX_TASKS: usize = 3;
    self.vec
        .iter()
        .filter_map(|allocator| allocator.get_compact_task(max_reserved))
        // Collecting into an `ArrayVec` pushes every item and panics once the
        // capacity is exceeded. `take` is lazy, so allocators beyond the limit
        // are never asked for a task and none of their groups get frozen.
        .take(MAX_TASKS)
        .collect()
}
/// Forwards the end of a compaction to the allocator owning `group_id`.
///
/// Panics if the group's chunk size maps to no allocator.
pub fn finish_compact_task(&self, group_id: GroupId) {
    // Any position inside the group identifies its allocator; use index 0.
    let probe = Position::new(group_id, 0);
    let owner = self.select_by_pos(probe).unwrap();
    owner.finish_compact_task(group_id);
}
}
#[cfg(test)]
mod tests {
use super::*;
// Checks allocator selection by position and by size, including rounding
// up to the next chunk size and rejection of out-of-range sizes.
#[test]
fn test_allocators() {
let dir = tempfile::tempdir().unwrap();
let path = dir.path();
let meta_config = MetaStoreConfig {
rocksdb: RocksDBConfig {
path: path.join("meta"),
create: true,
..Default::default()
},
..Default::default()
};
let meta_store = Arc::new(MetaStore::open(&meta_config).unwrap());
let allocators = Allocators::new(path, true, meta_store).unwrap();
assert_eq!(
allocators
.select_by_pos(Position::new(GroupId::new(CHUNK_SIZE_NORMAL, 0, 0), 0))
.unwrap()
.clusters
.chunk_size,
CHUNK_SIZE_NORMAL
);
// An exact size maps to its own allocator ...
assert_eq!(
allocators
.select_by_size(CHUNK_SIZE_SMALL)
.unwrap()
.clusters
.chunk_size,
CHUNK_SIZE_SMALL
);
// ... and one byte more rounds up to the next chunk size.
assert_eq!(
allocators
.select_by_size(CHUNK_SIZE_SMALL + 1)
.unwrap()
.clusters
.chunk_size,
CHUNK_SIZE_SMALL * 2,
);
assert_eq!(
allocators
.select_by_size(CHUNK_SIZE_NORMAL)
.unwrap()
.clusters
.chunk_size,
CHUNK_SIZE_NORMAL
);
assert_eq!(
allocators
.select_by_size(CHUNK_SIZE_NORMAL + 1)
.unwrap()
.clusters
.chunk_size,
CHUNK_SIZE_NORMAL * 2,
);
// Nothing has been allocated yet in a fresh store.
let used_size = allocators.used_size();
assert_eq!(used_size.allocated_size, 0);
assert_eq!(used_size.reserved_size, 0);
// CHUNK_SIZE_ULTRA is the largest accepted size; 1 GiB is rejected.
assert!(allocators
.select_by_pos(Position::new(GroupId::new(CHUNK_SIZE_ULTRA, 0, 0), 0))
.is_ok());
assert!(allocators
.select_by_pos(Position::new(GroupId::new(Size::gibibyte(1), 0, 0), 0))
.is_err());
assert!(allocators.select_by_size(Size::gibibyte(1)).is_err());
}
}

View File

@@ -0,0 +1,312 @@
use super::super::*;
use lazy_static::lazy_static;
use rand::Rng;
use std::cell::RefCell;
use std::sync::atomic::Ordering;
use std::sync::Arc;
/// Handle to one allocated chunk position together with its metadata.
/// Cloning adds a reference in the allocator and dropping removes one.
pub struct Chunk {
/// Metadata of the chunk, including its position `pos`.
meta: ChunkMeta,
/// Allocator that tracks references to `meta.pos` and performs the I/O.
allocator: Arc<Allocator>,
}
/// Shared-ownership alias for `Chunk`.
pub type ChunkArc = Arc<Chunk>;
lazy_static! {
// Zero-filled aligned buffer of CHUNK_SIZE_ULTRA bytes, built on first use.
// `safe_write` slices it to pad a chunk with zeros.
static ref ZERO: Vec<u8> = {
let mut vec = create_aligned_vec(CHUNK_SIZE_ULTRA);
vec.fill(0);
vec
};
}
impl Chunk {
// Per-thread aligned scratch buffer of CHUNK_SIZE_ULTRA bytes used to stage
// reads and writes. Its contents are not cleared between uses.
thread_local! {
static BUFFER: RefCell<Vec<u8>> = RefCell::new(create_aligned_vec(CHUNK_SIZE_ULTRA));
}
/// Wraps `meta` and `allocator` without touching the allocator's reference
/// counts; callers register the reference themselves.
pub fn new(meta: ChunkMeta, allocator: Arc<Allocator>) -> Self {
Self { meta, allocator }
}
/// Read-only access to the chunk metadata.
pub fn meta(&self) -> &ChunkMeta {
&self.meta
}
/// Capacity in bytes: the chunk size encoded in the position.
pub fn capacity(&self) -> u32 {
self.meta.pos.chunk_size().into()
}
pub fn update_meta(&mut self, req: &UpdateReq) {
self.meta.chunk_ver = req.out_commit_ver;
self.meta.chain_ver = req.chain_ver;
self.meta.last_request_id = req.last_request_id;
self.meta.last_client_low = req.last_client_low;
self.meta.last_client_high = req.last_client_high;
if req.desired_tag.is_empty() {
let r: u64 = rand::thread_rng().gen();
self.meta.etag = ETag::from(format!("{:X}", r).as_bytes());
} else {
self.meta.etag = req.desired_tag.into();
}
self.meta.uncommitted = true;
self.meta.timestamp = ChunkMeta::now();
}
/// Overwrites the chain version stored in the metadata.
pub fn set_chain_ver(&mut self, chain_ver: u32) {
self.meta.chain_ver = chain_ver;
}
/// Clears the `uncommitted` flag set by `update_meta`.
pub fn set_committed(&mut self) {
self.meta.uncommitted = false;
}
/// Duplicates this chunk into a newly allocated chunk from the same
/// allocator.
///
/// The copy keeps all metadata except `pos` (its own position) and `etag`
/// (reset to the default). Data is transferred in one aligned read and one
/// aligned write through the thread-local buffer. Returns the first
/// allocation or I/O error.
pub fn copy_chunk(&self) -> Result<Chunk> {
    // Destination chunk; group allocation is permitted.
    let mut copy = self.allocator.allocate(true)?;

    // Metadata: everything from `self` except position and etag.
    let copy_pos = copy.meta.pos;
    copy.meta = ChunkMeta {
        pos: copy_pos,
        etag: Default::default(),
        ..self.meta
    };

    // Data: length rounded up to the I/O alignment.
    let aligned_len = self.meta.len.next_multiple_of(ALIGN_SIZE.into()) as usize;
    Self::BUFFER.with(|cell| {
        let mut scratch = cell.borrow_mut();
        let staged = &mut scratch[..aligned_len];
        self.pread(staged, 0)?;
        copy.pwrite(staged, 0)?;
        Result::Ok(())
    })?;
    Ok(copy)
}
/// Writes `data` at `offset` into a newly allocated chunk instead of
/// modifying this one, and returns the new chunk.
///
/// The new chunk is taken from `allocators`, sized for
/// `max(self.meta.len, offset + data.len())`. Unless the old content can be
/// skipped (see `skip_read`), it is read into the thread-local buffer, merged
/// with `data`, and written out as one aligned write. The returned chunk has
/// only `len` and `checksum` set in its metadata by this function.
///
/// `checksum` is the caller-supplied checksum; it is reused as the chunk
/// checksum whenever the old content was not read, otherwise the checksum is
/// recomputed over the merged buffer. `metrics` receives counters and
/// latencies in microseconds.
pub fn copy_on_write(
&self,
data: &[u8],
offset: u32,
checksum: u32,
is_syncing: bool,
allow_to_allocate: bool,
allocators: &Allocators,
metrics: &Metrics,
) -> Result<Chunk> {
// 1. allocate new chunk.
let new_len = std::cmp::max(self.meta.len, offset + data.len() as u32);
let begin = std::time::Instant::now();
let mut new_chunk = allocators.allocate(Size::from(new_len), allow_to_allocate)?;
let begin2 = std::time::Instant::now();
let latency = begin2.duration_since(begin).as_micros() as _;
metrics.allocate_times.fetch_add(1, Ordering::AcqRel);
metrics
.allocate_latency
.fetch_add(latency, Ordering::AcqRel);
metrics.copy_on_write_times.fetch_add(1, Ordering::AcqRel);
// 2. write data.
// The old content is not needed when syncing, or when `data` starts at 0
// and covers at least the current length.
let skip_read = is_syncing || (offset == 0 && data.len() >= self.meta.len as usize);
let checksum = Self::BUFFER.with(|v| {
let mut vec = v.borrow_mut();
if !skip_read {
// aligned read.
let len = self.meta.len.next_multiple_of(ALIGN_SIZE.into());
let begin = std::time::Instant::now();
self.pread(&mut vec[..len as usize], 0)?;
let latency = std::time::Instant::now().duration_since(begin).as_micros() as _;
metrics
.copy_on_write_read_times
.fetch_add(1, Ordering::AcqRel);
metrics
.copy_on_write_read_bytes
.fetch_add(len as _, Ordering::AcqRel);
metrics
.copy_on_write_read_latency
.fetch_add(latency, Ordering::AcqRel);
}
// aligned write.
if skip_read && is_aligned_io(data, offset) {
// Fast path: `data` goes to the new chunk directly, no staging.
let begin = std::time::Instant::now();
new_chunk.pwrite(data, offset)?;
let latency = std::time::Instant::now().duration_since(begin).as_micros() as _;
metrics.pwrite_times.fetch_add(1, Ordering::AcqRel);
metrics.pwrite_latency.fetch_add(latency, Ordering::AcqRel);
} else {
// Staged path: zero any gap between the old end and `offset`, place
// `data`, then write the buffer from 0 up to the aligned new length.
// NOTE(review): when `skip_read` is true, buffer bytes outside the
// zeroed gap and `data` were not loaded from this chunk; presumably
// syncing writes start at offset 0 — confirm with callers.
if self.meta.len < offset {
vec[self.meta.len as usize..offset as usize].fill(0);
}
vec[offset as usize..][..data.len()].copy_from_slice(data);
let len = new_len.next_multiple_of(ALIGN_SIZE.into());
let begin = std::time::Instant::now();
new_chunk.pwrite(&vec[..len as usize], 0)?;
let latency = std::time::Instant::now().duration_since(begin).as_micros() as _;
metrics.pwrite_times.fetch_add(1, Ordering::AcqRel);
metrics.pwrite_latency.fetch_add(latency, Ordering::AcqRel);
};
// Reuse the caller's checksum when the old content was skipped;
// otherwise compute it over the merged content.
Result::Ok(if skip_read {
metrics.checksum_reuse.fetch_add(1, Ordering::AcqRel);
checksum
} else {
metrics.checksum_recalculate.fetch_add(1, Ordering::AcqRel);
crc32c::crc32c(&vec[..new_len as usize])
})
})?;
let latency = std::time::Instant::now().duration_since(begin2).as_micros() as _;
metrics
.copy_on_write_latency
.fetch_add(latency, Ordering::AcqRel);
// 3. copy meta.
// When syncing, the new length is the end of `data`, even if that is
// shorter than the old length.
new_chunk.meta.len = if is_syncing {
offset + data.len() as u32
} else {
new_len
};
new_chunk.meta.checksum = checksum;
Ok(new_chunk)
}
/// Updates this chunk in place for writes that never overwrite existing
/// bytes: shortening, zero-extension, and appending.
///
/// Cases, in the order they are checked:
/// 1. `truncate` with `offset < len`: shorten to `offset` and recompute the
///    checksum from the remaining bytes. `data` is not used in this case.
/// 2. Current length, `offset` and `data` are all aligned: optionally pad with
///    zeros up to `offset`, then append `data` directly, combining checksums.
/// 3. The write ends beyond the current length: stage the partial tail block,
///    zero padding and `data` in the thread-local buffer and write it out.
/// 4. Otherwise nothing is written; `data` must be empty.
///
/// `checksum` is the caller-supplied checksum of `data`, used only in the
/// direct append of case 2. Panics (via `assert!`) if an append does not
/// start at or after the current length.
pub fn safe_write(
&mut self,
data: &[u8],
offset: u32,
checksum: u32,
truncate: bool,
metrics: &Metrics,
) -> Result<()> {
if truncate && offset < self.meta.len {
metrics
.safe_write_truncate_shorten
.fetch_add(1, Ordering::AcqRel);
metrics.checksum_recalculate.fetch_add(1, Ordering::AcqRel);
return Self::BUFFER.with(|v| {
// aligned read.
let mut vec = v.borrow_mut();
let len = offset.next_multiple_of(ALIGN_SIZE.into());
self.pread(&mut vec[..len as usize], 0)?;
// Only metadata changes; nothing is written to the chunk here.
self.meta.len = offset;
self.meta.checksum = crc32c::crc32c(&vec[..offset as usize]);
Result::Ok(())
});
}
if is_aligned_len(self.meta.len)
&& is_aligned_len(offset)
&& (data.is_empty() || is_aligned_buf(data))
{
// already aligned.
if offset > self.meta.len {
// Extend with zeros from the current end up to `offset`.
let padding = (offset - self.meta.len) as usize;
let begin = std::time::Instant::now();
self.pwrite(&ZERO[..padding], self.meta.len)?;
let latency = std::time::Instant::now().duration_since(begin).as_micros() as _;
metrics.pwrite_times.fetch_add(1, Ordering::AcqRel);
metrics.pwrite_latency.fetch_add(latency, Ordering::AcqRel);
self.meta.len = offset;
self.meta.checksum = crc32c::crc32c_append(self.meta.checksum, &ZERO[..padding]);
metrics
.safe_write_truncate_extend
.fetch_add(1, Ordering::AcqRel);
metrics.checksum_combine.fetch_add(1, Ordering::AcqRel);
}
if !data.is_empty() {
// Direct append: `data` must start exactly at the current end.
assert!(offset == self.meta.len);
let begin = std::time::Instant::now();
self.pwrite(data, offset)?;
let latency = std::time::Instant::now().duration_since(begin).as_micros() as u64;
metrics.pwrite_times.fetch_add(1, Ordering::AcqRel);
metrics.pwrite_latency.fetch_add(latency, Ordering::AcqRel);
self.meta.len = offset + data.len() as u32;
// Combine the existing checksum with the caller's checksum of `data`
// instead of hashing the bytes again.
self.meta.checksum =
crc32c::crc32c_combine(self.meta.checksum, checksum, data.len());
metrics
.safe_write_direct_append
.fetch_add(1, Ordering::AcqRel);
metrics.checksum_combine.fetch_add(1, Ordering::AcqRel);
}
} else if self.meta.len < offset + data.len() as u32 {
// copy to buffer and write.
assert!(self.meta.len <= offset);
Self::BUFFER.with(|v| {
let mut vec = v.borrow_mut();
// Round the current length down to the alignment boundary.
let start = self.meta.len & !(ALIGN_SIZE.0 as u32 - 1);
if start != self.meta.len {
// The last block is partially filled: read it back so the
// rewrite of that block keeps the existing bytes.
metrics
.safe_write_read_tail_times
.fetch_add(1, Ordering::AcqRel);
metrics
.safe_write_read_tail_bytes
.fetch_add(ALIGN_SIZE.0, Ordering::AcqRel);
self.pread(&mut vec[start as usize..][..ALIGN_SIZE.into()], start)?;
}
if self.meta.len < offset {
metrics
.safe_write_truncate_extend
.fetch_add(1, Ordering::AcqRel);
vec[self.meta.len as usize..offset as usize].fill(0);
}
vec[offset as usize..][..data.len()].copy_from_slice(data);
let new_len = offset as usize + data.len();
let begin = std::time::Instant::now();
// Write from the aligned start up to the aligned new end.
self.pwrite(
&vec[start as usize..new_len.next_multiple_of(ALIGN_SIZE.into())],
start,
)?;
let latency = std::time::Instant::now().duration_since(begin).as_micros() as _;
metrics.pwrite_times.fetch_add(1, Ordering::AcqRel);
metrics.pwrite_latency.fetch_add(latency, Ordering::AcqRel);
// Extend the checksum over the newly added bytes (padding + data).
self.meta.checksum = crc32c::crc32c_append(
self.meta.checksum,
&vec[self.meta.len as usize..new_len],
);
metrics
.safe_write_indirect_append
.fetch_add(1, Ordering::AcqRel);
metrics.checksum_combine.fetch_add(1, Ordering::AcqRel);
self.meta.len = new_len as u32;
Result::Ok(())
})?;
} else {
// The write would not extend the chunk; only an empty write is accepted.
assert!(data.is_empty());
}
Ok(())
}
/// Reads into `buf` from this chunk at `offset`, delegating to the cluster storage.
pub fn pread(&self, buf: &mut [u8], offset: u32) -> Result<()> {
self.allocator.clusters.pread(self.meta.pos, buf, offset)
}
/// Writes `buf` to this chunk at `offset`, delegating to the cluster storage.
/// Does not update `meta`; callers adjust length and checksum themselves.
pub(super) fn pwrite(&self, buf: &[u8], offset: u32) -> Result<()> {
self.allocator.clusters.pwrite(self.meta.pos, buf, offset)
}
/// Returns what `Clusters::fd_and_offset` reports for this chunk's position.
pub fn fd_and_offset(&self) -> FdAndOffset {
self.allocator.clusters.fd_and_offset(self.meta.pos)
}
}
// Cloning registers an additional (non-first) reference to the same position
// in the allocator and returns a handle with a copy of the metadata.
impl Clone for Chunk {
fn clone(&self) -> Self {
self.allocator.reference(self.meta.clone(), false)
}
}
// Dropping a handle releases its reference to the position in the allocator.
impl Drop for Chunk {
fn drop(&mut self) {
self.allocator.dereference(self.meta.pos);
}
}
// Debug output is that of the metadata alone; the allocator is not printed.
impl std::fmt::Debug for Chunk {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
std::fmt::Debug::fmt(&self.meta, f)
}
}

View File

@@ -0,0 +1,304 @@
use super::super::*;
use std::collections::hash_map::Entry;
use std::sync::atomic::Ordering;
use std::sync::Arc;
/// Tracks which chunk positions are in use for one chunk size.
/// A group lives in exactly one of: full, active, frozen, or (when empty)
/// inside the group allocator.
pub struct ChunkAllocator {
/// Groups with every position in use.
pub full_groups: ShardsSet<GroupId>,
/// Partially used groups and their bitmap state.
pub active_groups: ShardsMap<GroupId, GroupState>,
/// Active group ids bucketed by `GroupState::level()`.
pub(super) active_levels: [ShardsSet<GroupId>; GroupState::LEVELS],
/// Groups taken out of allocation by `get_compact_task`.
pub(super) frozen_groups: ShardsMap<GroupId, GroupState>,
/// Supplier of empty groups.
pub(super) group_allocator: GroupAllocator,
/// Reference count per position currently handed out.
pub(super) position_rc: ShardsMap<Position, u32>,
/// Shared usage counters.
pub(super) counter: Arc<AllocatorCounter>,
}
impl ChunkAllocator {
/// Creates an empty allocator for `chunk_size` with a fresh counter that is
/// shared with its group allocator. Nothing is loaded from storage.
pub fn with_chunk_size(chunk_size: Size) -> Self {
let counter = Arc::new(AllocatorCounter::new(chunk_size));
Self {
full_groups: Default::default(),
active_groups: Default::default(),
active_levels: Default::default(),
frozen_groups: Default::default(),
group_allocator: GroupAllocator::init(counter.clone()),
position_rc: Default::default(),
counter,
}
}
/// Rebuilds allocator state for `chunk_size` from the group-bitmap records
/// visited by `it`, and initialises `counter` with the resulting totals.
///
/// Group ids are expected in ascending order (asserted). Ids skipped between
/// records are remembered as unallocated so they can be reused. Reference
/// counts (`position_rc`) and `frozen_groups` start empty.
///
/// Returns an error if iteration, key parsing or state decoding fails.
pub fn load(
mut it: RocksDBIterator,
counter: Arc<AllocatorCounter>,
chunk_size: Size,
) -> Result<Self> {
let mut full_groups = ShardsSet::with_capacity(4096);
let mut active_groups = ShardsMap::with_capacity(4096);
let frozen_groups = ShardsMap::with_capacity(4096);
let mut active_levels = std::array::from_fn(|_| ShardsSet::with_capacity(4096));
let mut allocated_groups = ShardsSet::with_capacity(4096);
let mut unallocated_groups = ShardsSet::with_capacity(4096);
// Next group id expected; starts at the first id for this chunk size.
let mut current = GroupId::new(chunk_size, 0, 0);
let mut allocated_count: u64 = 0;
let mut reserved_count: u64 = 0;
let prefix = MetaKey::group_bits_chunk_size_prefix(current);
it.iterate(prefix, |key, value| {
let group_id = MetaKey::parse_group_bits_key(key)?;
let group_state = GroupState::from(value)?;
assert!(
current <= group_id,
"current {current:?} > next {group_id:?}"
);
// Every id skipped before this record is a hole that can be reused.
while current < group_id {
unallocated_groups.insert(current);
current.next();
}
current.next();
allocated_count += GroupState::TOTAL_BITS as u64;
if group_state.is_empty() {
// Allocated on disk but holding no chunks: ready to be handed out.
allocated_groups.insert(group_id);
reserved_count += GroupState::TOTAL_BITS as u64;
} else if group_state.is_full() {
full_groups.insert(group_id);
} else {
// Partially used: only the free positions count as reserved.
reserved_count += GroupState::TOTAL_BITS as u64 - group_state.count() as u64;
active_levels[group_state.level() as usize].insert(group_id);
active_groups.insert(group_id, group_state);
}
Ok(())
})?;
counter.init(allocated_count, reserved_count);
let chunk_allocator = ChunkAllocator {
full_groups,
active_groups,
active_levels,
frozen_groups,
counter: counter.clone(),
group_allocator: GroupAllocator {
allocated_groups,
unallocated_groups,
// One past the highest group id seen.
next_group_id: current,
counter,
},
position_rc: ShardsMap::with_capacity(1 << 20),
};
Ok(chunk_allocator)
}
/// Hands out one free position and registers its first reference.
///
/// Active groups are tried first, scanning levels from the highest index
/// down. If there is no active group, an empty group is obtained from the
/// group allocator (`allow_to_allocate` is forwarded to it) and its first
/// position is used.
///
/// Returns the error of `GroupAllocator::allocate` when no group is available.
pub fn allocate(&mut self, clusters: &Clusters, allow_to_allocate: bool) -> Result<Position> {
if !self.active_groups.is_empty() {
for level in (0..GroupState::LEVELS).rev() {
let set = &mut self.active_levels[level];
if let Some(&group_id) = set.iter().next() {
let state = self.active_groups.get_mut(&group_id).unwrap();
let index = state.allocate().unwrap();
if state.is_full() {
// No free position left: the group leaves the active sets.
self.full_groups.insert(group_id);
self.active_groups.remove(&group_id);
set.remove(&group_id);
} else if state.level() != level as u32 {
// The allocation moved the group to the next level bucket.
set.remove(&group_id);
self.active_levels[level + 1].insert(group_id);
}
let pos = Position::new(group_id, index);
self.reference(pos, true);
self.counter.allocate_chunk();
return Ok(pos);
}
}
}
// No active group: start filling an empty one.
let group_id = self.group_allocator.allocate(clusters, allow_to_allocate)?;
self.counter.allocate_chunk();
let state = match self.active_groups.entry(group_id) {
Entry::Occupied(_) => panic!("should not be active groups: {:?}", group_id),
Entry::Vacant(entry) => entry.insert(GroupState::empty()),
};
let index = state.allocate().unwrap();
self.active_levels[state.level() as usize].insert(group_id);
let pos = Position::new(group_id, index);
self.reference(pos, true);
Ok(pos)
}
/// Adds one reference to `pos` and updates the position counters.
///
/// Panics if `pos` is not marked as used in its group, if the group is
/// unknown, or if `first_ref` is set but the position was already referenced.
pub fn reference(&mut self, pos: Position, first_ref: bool) {
    let group_id = pos.group_id();
    // The position must be in use in whichever collection holds its group.
    if let Some(state) = self.active_groups.get_mut(&group_id) {
        assert!(state.check(pos.index()), "ref pos failed: {:?}", pos);
    } else if let Some(state) = self.frozen_groups.get_mut(&group_id) {
        assert!(state.check(pos.index()), "ref pos failed: {:?}", pos);
    } else {
        assert!(self.full_groups.contains(&group_id));
    }

    // Bump the per-position count; a new entry also counts as a new position.
    let rc = match self.position_rc.entry(pos) {
        Entry::Occupied(mut existing) => {
            *existing.get_mut() += 1;
            *existing.get()
        }
        Entry::Vacant(slot) => {
            self.counter.position_count.fetch_add(1, Ordering::AcqRel);
            *slot.insert(1)
        }
    };
    self.counter.position_rc.fetch_add(1, Ordering::AcqRel);

    assert!(
        !first_ref || rc == 1,
        "should be first ref to pos {:?}, rc {}",
        pos,
        rc
    );
}
/// Removes one reference to `pos`; the last reference frees the position.
///
/// Panics if `pos` has no reference entry.
pub fn dereference(&mut self, pos: Position) {
    self.counter.position_rc.fetch_sub(1, Ordering::AcqRel);
    let remaining = {
        let rc = self.position_rc.get_mut(&pos).unwrap();
        *rc -= 1;
        *rc
    };
    if remaining != 0 {
        return;
    }
    // Last reference gone: forget the position and release its slot.
    self.counter.position_count.fetch_sub(1, Ordering::AcqRel);
    self.position_rc.remove(&pos);
    self.deallocate(pos);
}
/// Marks `pos` as free in its group and moves the group between collections
/// as needed; finally counts one more reserved chunk.
///
/// Panics if the group of `pos` is not active, frozen or full, or if the
/// group state rejects the deallocation.
pub fn deallocate(&mut self, pos: Position) {
let group_id = pos.group_id();
if let Some(state) = self.active_groups.get_mut(&group_id) {
// Remember the level before the change to find the right bucket.
let level = state.level();
state.deallocate(pos.index()).unwrap();
if state.is_empty() {
// Group holds no chunks any more: return it to the group allocator.
self.active_groups.remove(&group_id);
self.active_levels[level as usize].remove(&group_id);
self.group_allocator.deallocate(group_id);
} else if state.level() != level {
// The group dropped to the next lower level bucket.
self.active_levels[level as usize].remove(&group_id);
self.active_levels[level as usize - 1].insert(group_id);
}
} else if let Some(state) = self.frozen_groups.get_mut(&group_id) {
// Frozen groups are not in any level bucket.
state.deallocate(pos.index()).unwrap();
if state.is_empty() {
self.frozen_groups.remove(&group_id);
self.group_allocator.deallocate(group_id);
}
} else if self.full_groups.contains(&group_id) {
// Full groups keep no state; rebuild it as "full minus this position"
// and make the group active again.
let mut state = GroupState::full();
state.deallocate(pos.index()).unwrap();
self.active_levels[state.level() as usize].insert(group_id);
self.active_groups.insert(group_id, state);
self.full_groups.remove(&group_id);
} else {
unreachable!(
"deallocate position failed! not found this position: {:?}",
pos
);
}
self.counter.deallocate_chunk();
}
/// Selects a group to compact when more than `max_reserved` chunks are
/// reserved.
///
/// The first group found while scanning level buckets from index 0 upwards is
/// moved from the active collections into `frozen_groups`, so `allocate` no
/// longer picks it. Returns `None` when the threshold is not exceeded or
/// there is no active group.
pub fn get_compact_task(&mut self, max_reserved: u64) -> Option<GroupId> {
    if self.counter.reserved_chunks() <= max_reserved {
        return None;
    }
    for level_set in self.active_levels.iter_mut() {
        let group_id = match level_set.iter().next() {
            Some(&candidate) => candidate,
            None => continue,
        };
        level_set.remove(&group_id);
        let state = self.active_groups.remove(&group_id).unwrap();
        self.frozen_groups.insert(group_id, state);
        return Some(group_id);
    }
    None
}
/// Ends the compaction of `group_id`.
///
/// If the group is still frozen (it still holds chunks), it is moved back to
/// the active collections. If it is no longer frozen, only a log line is
/// written.
pub fn finish_compact_task(&mut self, group_id: GroupId) {
    match self.frozen_groups.remove(&group_id) {
        Some(state) => {
            self.active_levels[state.level() as usize].insert(group_id);
            self.active_groups.insert(group_id, state);
            tracing::info!("finish compact task and move back {:?}", group_id);
        }
        None => {
            tracing::info!("finish compact task successful!");
        }
    }
}
}
#[cfg(test)]
mod tests {
use super::*;
// Fills one group position by position and checks level transitions,
// counters, and the final move to `full_groups`.
#[test]
fn test_chunk_allocator() {
let dir = tempfile::tempdir().unwrap();
let config = ClustersConfig {
path: dir.path().into(),
chunk_size: CHUNK_SIZE_NORMAL,
create: true,
};
let clusters = Clusters::open(&config).unwrap();
let mut chunk_allocator = ChunkAllocator::with_chunk_size(CHUNK_SIZE_NORMAL);
assert!(chunk_allocator.active_groups.is_empty());
assert!(chunk_allocator
.active_levels
.iter()
.all(|set| set.is_empty()));
assert!(chunk_allocator.full_groups.is_empty());
// Number of allocations after which a group moves up one level.
let one_level_count = GroupState::TOTAL_BITS / GroupState::LEVELS;
for i in 0..(one_level_count - 1) {
let pos = chunk_allocator.allocate(&clusters, true).unwrap();
assert_eq!(pos, Position::new(GroupId::default(), i as _));
}
assert_eq!(chunk_allocator.active_groups.len(), 1);
assert_eq!(chunk_allocator.active_levels[0].len(), 1);
// This allocation moves the group from level 0 to level 1.
let pos = chunk_allocator.allocate(&clusters, true).unwrap();
assert_eq!(
pos,
Position::new(GroupId::default(), one_level_count as u8 - 1)
);
assert_eq!(chunk_allocator.active_groups.len(), 1);
assert_eq!(chunk_allocator.active_levels[0].len(), 0);
assert_eq!(chunk_allocator.active_levels[1].len(), 1);
let used_size = chunk_allocator.counter.used_size();
assert_eq!(
used_size.allocated_size,
CHUNK_SIZE_NORMAL * GroupState::TOTAL_BITS
);
assert_eq!(
used_size.reserved_size,
CHUNK_SIZE_NORMAL * (GroupState::TOTAL_BITS - one_level_count)
);
// Use up the remaining positions; the group must end up full.
for i in one_level_count..GroupState::TOTAL_BITS {
let pos = chunk_allocator.allocate(&clusters, true).unwrap();
assert_eq!(pos, Position::new(GroupId::default(), i as _));
}
assert!(chunk_allocator.active_groups.is_empty());
assert!(chunk_allocator
.active_levels
.iter()
.all(|set| set.is_empty()));
assert_eq!(chunk_allocator.full_groups.len(), 1);
}
// Deallocating a position whose group is unknown must hit the
// `unreachable!` branch of `deallocate`.
#[test]
#[should_panic(expected = "not found this position")]
fn test_chunk_invalid_deallocate() {
let mut allocator = ChunkAllocator::with_chunk_size(CHUNK_SIZE_NORMAL);
allocator.deallocate(Position::default());
}
}

View File

@@ -0,0 +1,190 @@
use std::sync::Arc;
use super::super::*;
/// Hands out and takes back whole groups for one chunk size.
pub struct GroupAllocator {
/// Groups that are allocated but hold no chunks; handed out first.
pub(super) allocated_groups: ShardsSet<GroupId>,
/// Group ids below `next_group_id` that are not allocated; reused before
/// `next_group_id` is advanced.
pub(super) unallocated_groups: ShardsSet<GroupId>,
/// Next never-used group id.
pub(super) next_group_id: GroupId,
/// Shared usage counters, updated when groups are (de)allocated.
pub(super) counter: Arc<AllocatorCounter>,
}
/// Group-level work item produced by `GroupAllocator::get_allocate_task`.
#[derive(Debug, Clone, Copy)]
pub enum AllocateTask {
/// Nothing to do.
None,
/// The given group id should be allocated.
Allocate(GroupId),
/// The given (empty) group should be deallocated.
Deallocate(GroupId),
}
impl GroupAllocator {
pub fn init(counter: Arc<AllocatorCounter>) -> Self {
Self {
allocated_groups: Default::default(),
unallocated_groups: Default::default(),
next_group_id: Default::default(),
counter,
}
}
pub fn allocate(&mut self, clusters: &Clusters, allow_to_allocate: bool) -> Result<GroupId> {
if let Some(&group_id) = self.allocated_groups.iter().next() {
self.allocated_groups.remove(&group_id);
Ok(group_id)
} else if allow_to_allocate {
let group_id = self.get_unallocated_group_id();
tracing::info!("allocate group slow path {:?}", group_id);
let result = clusters.allocate(group_id);
if let Err(err) = result {
self.unallocated_groups.insert(group_id);
return Err(err);
}
self.counter.allocate_group();
Ok(group_id)
} else {
Err(Error::NoSpace)
}
}
pub fn deallocate(&mut self, group_id: GroupId) {
self.allocated_groups.insert(group_id);
}
fn get_unallocated_group_id(&mut self) -> GroupId {
if let Some(&group_id) = self.unallocated_groups.iter().next() {
self.unallocated_groups.remove(&group_id);
group_id
} else {
let group_id = self.next_group_id;
self.next_group_id = self.next_group_id.plus_one();
group_id
}
}
/// Decides what background work is needed to keep the number of ready groups
/// within `[min_remain, max_remain]`.
///
/// Fewer than `min_remain` ready groups yields an `Allocate` task with an id
/// from `get_unallocated_group_id`. More than `max_remain` removes one group
/// from `allocated_groups` and yields a `Deallocate` task for it. Otherwise
/// the result is `AllocateTask::None`.
pub fn get_allocate_task(&mut self, min_remain: usize, max_remain: usize) -> AllocateTask {
    let ready = self.allocated_groups.len();
    if ready < min_remain {
        return AllocateTask::Allocate(self.get_unallocated_group_id());
    }
    if ready <= max_remain {
        return AllocateTask::None;
    }
    // ready > max_remain >= 0, so at least one element exists and unwrap cannot fail.
    let surplus = *self.allocated_groups.iter().next().unwrap();
    self.allocated_groups.remove(&surplus);
    AllocateTask::Deallocate(surplus)
}
/// Records the outcome of a task obtained from `get_allocate_task`.
///
/// * `Allocate`, success: the counter is notified and the group becomes ready.
/// * `Allocate`, failure: the id goes to `unallocated_groups`.
/// * `Deallocate`, success: the counter is notified and the id goes to `unallocated_groups`.
/// * `Deallocate`, failure: the group is put back into `allocated_groups`.
/// * `None`: nothing happens.
pub fn finish_allocate_task(&mut self, task: AllocateTask, succ: bool) {
    match task {
        AllocateTask::None => {}
        AllocateTask::Allocate(group_id) => {
            if succ {
                self.counter.allocate_group();
                self.allocated_groups.insert(group_id);
            } else {
                self.unallocated_groups.insert(group_id);
            }
        }
        AllocateTask::Deallocate(group_id) => {
            if succ {
                self.counter.deallocate_group();
                self.unallocated_groups.insert(group_id);
            } else {
                self.allocated_groups.insert(group_id);
            }
        }
    }
}
}
// Unit tests for GroupAllocator: the on-demand allocate/deallocate path and
// the background task path (get_allocate_task / finish_allocate_task).
#[cfg(test)]
mod tests {
use super::*;
/// Exercises `allocate` / `deallocate` against real cluster files in a temp dir.
#[test]
fn test_group_allocator() {
let dir = tempfile::tempdir().unwrap();
let config = ClustersConfig {
path: dir.path().into(),
chunk_size: CHUNK_SIZE_NORMAL,
create: true,
};
let clusters = Clusters::open(&config).unwrap();
let counter = Arc::new(AllocatorCounter::new(CHUNK_SIZE_NORMAL));
let mut group_allocator = GroupAllocator::init(counter);
// First allocation mints the default id and advances next_group_id.
let group_id_1 = group_allocator.allocate(&clusters, true).unwrap();
assert_eq!(group_id_1, GroupId::default());
assert_eq!(group_allocator.next_group_id, group_id_1.plus_one());
assert!(group_allocator.allocated_groups.is_empty());
assert!(group_allocator.unallocated_groups.is_empty());
// Second allocation mints the following id.
let group_id_2 = group_allocator.allocate(&clusters, true).unwrap();
assert_eq!(group_id_1.plus_one(), group_id_2);
assert_eq!(group_allocator.next_group_id, group_id_2.plus_one());
assert!(group_allocator.allocated_groups.is_empty());
assert!(group_allocator.unallocated_groups.is_empty());
// A deallocated group becomes ready again without changing next_group_id.
group_allocator.deallocate(group_id_1);
assert_eq!(group_allocator.next_group_id, group_id_2.plus_one());
assert_eq!(group_allocator.allocated_groups.len(), 1);
assert!(group_allocator.unallocated_groups.is_empty());
// The ready group is handed out before any new id is minted.
let group_id_3 = group_allocator.allocate(&clusters, true).unwrap();
assert_eq!(group_id_1, group_id_3);
assert_eq!(group_allocator.next_group_id, group_id_2.plus_one());
assert!(group_allocator.allocated_groups.is_empty());
assert!(group_allocator.unallocated_groups.is_empty());
// With no ready group and allocation disallowed, allocate must fail.
group_allocator.allocate(&clusters, false).unwrap_err();
}
/// Exercises the task API only; no `Clusters` instance is involved.
#[test]
fn test_group_allocator_task() {
let counter = Arc::new(AllocatorCounter::new(CHUNK_SIZE_NORMAL));
let mut group_allocator = GroupAllocator::init(counter);
assert!(group_allocator.allocated_groups.is_empty());
assert!(group_allocator.unallocated_groups.is_empty());
assert_eq!(group_allocator.next_group_id.cluster(), 0);
// Below min_remain: an Allocate task is issued and a new id is minted.
let task = group_allocator.get_allocate_task(2, 4);
assert!(matches!(task, AllocateTask::Allocate(_)));
assert!(group_allocator.allocated_groups.is_empty());
assert!(group_allocator.unallocated_groups.is_empty());
assert_eq!(group_allocator.next_group_id.cluster(), 1);
// A failed Allocate parks the id; the next task reuses it instead of minting.
group_allocator.finish_allocate_task(task, false);
let task = group_allocator.get_allocate_task(2, 4);
assert!(matches!(task, AllocateTask::Allocate(_)));
assert!(group_allocator.allocated_groups.is_empty());
assert!(group_allocator.unallocated_groups.is_empty());
assert_eq!(group_allocator.next_group_id.cluster(), 1);
// A successful Allocate makes the group ready.
group_allocator.finish_allocate_task(task, true);
assert_eq!(group_allocator.allocated_groups.len(), 1);
assert_eq!(group_allocator.unallocated_groups.len(), 0);
assert_eq!(group_allocator.next_group_id.cluster(), 1);
let task = group_allocator.get_allocate_task(2, 4);
assert!(matches!(task, AllocateTask::Allocate(_)));
group_allocator.finish_allocate_task(task, true);
assert_eq!(group_allocator.allocated_groups.len(), 2);
assert_eq!(group_allocator.unallocated_groups.len(), 0);
assert_eq!(group_allocator.next_group_id.cluster(), 2);
// Within [min_remain, max_remain]: nothing to do, and finishing None is a no-op.
let task = group_allocator.get_allocate_task(2, 4);
assert!(matches!(task, AllocateTask::None));
group_allocator.finish_allocate_task(task, true);
assert_eq!(group_allocator.allocated_groups.len(), 2);
assert_eq!(group_allocator.unallocated_groups.len(), 0);
assert_eq!(group_allocator.next_group_id.cluster(), 2);
// Raising min_remain triggers another Allocate; failing it parks the new id.
let task = group_allocator.get_allocate_task(3, 4);
assert!(matches!(task, AllocateTask::Allocate(_)));
group_allocator.finish_allocate_task(task, false);
assert_eq!(group_allocator.allocated_groups.len(), 2);
assert_eq!(group_allocator.unallocated_groups.len(), 1);
assert_eq!(group_allocator.next_group_id.cluster(), 3);
// Above max_remain: a Deallocate task; failing it restores the ready group.
let task = group_allocator.get_allocate_task(1, 1);
assert!(matches!(task, AllocateTask::Deallocate(_)));
group_allocator.finish_allocate_task(task, false);
assert_eq!(group_allocator.allocated_groups.len(), 2);
assert_eq!(group_allocator.unallocated_groups.len(), 1);
assert_eq!(group_allocator.next_group_id.cluster(), 3);
}
}

View File

@@ -0,0 +1,27 @@
use std::sync::atomic::AtomicU64;
/// Engine-wide counters, each an `AtomicU64`.
///
/// The cxx layer asserts that this struct has the same size and alignment as
/// its exported `Metrics` counterpart, so fields must stay in sync with it.
/// `*_latency` fields hold running sums: the exporter divides each by the
/// matching `*_times` counter and resets every field to zero when read.
// NOTE(review): units of the latency and byte counters are not visible here — confirm at the call sites.
#[derive(Debug, Default)]
#[repr(C)]
pub struct Metrics {
// Copy-on-write counters.
pub copy_on_write_times: AtomicU64,
pub copy_on_write_latency: AtomicU64,
pub copy_on_write_read_bytes: AtomicU64,
pub copy_on_write_read_times: AtomicU64,
pub copy_on_write_read_latency: AtomicU64,
// Checksum handling counters.
pub checksum_reuse: AtomicU64,
pub checksum_combine: AtomicU64,
pub checksum_recalculate: AtomicU64,
// Safe-write counters.
pub safe_write_direct_append: AtomicU64,
pub safe_write_indirect_append: AtomicU64,
pub safe_write_truncate_shorten: AtomicU64,
pub safe_write_truncate_extend: AtomicU64,
pub safe_write_read_tail_times: AtomicU64,
pub safe_write_read_tail_bytes: AtomicU64,
// Allocation counters.
pub allocate_times: AtomicU64,
pub allocate_latency: AtomicU64,
// pwrite counters.
pub pwrite_times: AtomicU64,
pub pwrite_latency: AtomicU64,
}

View File

@@ -0,0 +1,17 @@
mod allocator;
mod allocator_counter;
mod allocators;
mod chunk;
mod chunk_allocator;
mod group_allocator;
mod metrics;
mod writing_chunk;
pub use allocator::*;
pub use allocator_counter::*;
pub use allocators::*;
pub use chunk::*;
pub use chunk_allocator::*;
pub use group_allocator::*;
pub use metrics::*;
pub use writing_chunk::*;

View File

@@ -0,0 +1,124 @@
use crate::{Bytes, Chunk, ChunkArc, ChunkMeta};
use dashmap::DashMap;
use std::{collections::HashMap, sync::Arc};
/// Entry stored in the writing list for a chunk that has a write in flight.
pub struct WritingHolder {
/// The chunk being written.
pub chunk: Chunk,
/// Set to `true` when the owning `WritingChunk` is dropped without a successful commit.
pub abort: bool,
}
/// Concurrent two-level map of in-flight writes: the outer key is the first
/// `prefix_len` bytes of a chunk id, the inner key is the full chunk id.
pub type WritingList = DashMap<Bytes, HashMap<Bytes, WritingHolder>>;
/// Handle for a chunk write in progress. Dropping it updates the shared
/// writing list (see the `Drop` implementation).
pub struct WritingChunk {
/// Full id of the chunk being written.
pub chunk_id: Bytes,
/// The chunk value carried by this write.
pub chunk: Chunk,
/// Shared writing list this chunk is expected to be registered in.
pub list: Arc<WritingList>,
/// Number of leading `chunk_id` bytes used as the outer key of `list`.
pub prefix_len: u32,
/// Whether this write is a removal.
pub is_remove: bool,
/// Whether the commit succeeded; selects the behavior on drop.
pub commit_succ: bool,
}
impl WritingChunk {
/// Metadata of the wrapped chunk.
pub fn meta(&self) -> &ChunkMeta {
self.chunk.meta()
}
/// Forwards to `Chunk::set_committed` on the wrapped chunk.
pub fn set_committed(&mut self) {
self.chunk.set_committed();
}
/// Marks the commit as successful, so that dropping this value removes the
/// entry from the writing list instead of flagging it as aborted.
pub fn commit_succ(&mut self) {
self.commit_succ = true;
}
}
impl Drop for WritingChunk {
    /// Settles this chunk's entry in the writing list.
    ///
    /// After a successful commit the entry is removed; otherwise the entry is
    /// kept and flagged with `abort = true`.
    ///
    /// # Panics
    /// Panics when the prefix bucket or the chunk entry is missing from the list.
    fn drop(&mut self) {
        let prefix = &self.chunk_id[..self.prefix_len as usize];
        let settled = match self.list.get_mut(prefix) {
            None => false,
            Some(mut writings) => {
                let hit = if self.commit_succ {
                    writings.remove(&self.chunk_id).is_some()
                } else if let Some(holder) = writings.get_mut(&self.chunk_id) {
                    holder.abort = true;
                    true
                } else {
                    false
                };
                hit
            }
        };
        if !settled {
            panic!("chunk id {:?} is not in the writing list!", self.chunk_id);
        }
    }
}
/// Produces a shared chunk handle from a writing chunk by cloning the wrapped
/// `Chunk` into a new `Arc`; the writing chunk itself is left untouched.
impl From<&WritingChunk> for ChunkArc {
fn from(chunk: &WritingChunk) -> Self {
Arc::new(chunk.chunk.clone())
}
}
impl std::fmt::Debug for WritingChunk {
    /// Prints `chunk_id`, `chunk` and `is_remove`; the remaining fields are
    /// left out of the output.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("WritingChunk");
        out.field("chunk_id", &self.chunk_id);
        out.field("chunk", &self.chunk);
        out.field("is_remove", &self.is_remove);
        out.finish()
    }
}
// Tests that dropping a WritingChunk which is not registered in the writing
// list panics, for each way the lookup can miss.
#[cfg(test)]
mod tests {
use crate::*;
use std::sync::Arc;
/// Builds a `WritingChunk` with id `b"test"` that is never inserted into the
/// writing list, prints it, and lets it drop at the end of the function.
/// `has_list` controls whether the two-byte prefix bucket `b"te"` exists.
fn test_writing_chunk_not_in_list(has_list: bool, commit_succ: bool) {
let dir = tempfile::tempdir().unwrap();
let path = dir.path();
let meta_config = MetaStoreConfig {
rocksdb: RocksDBConfig {
path: path.join("meta"),
create: true,
..Default::default()
},
..Default::default()
};
let meta_store = Arc::new(MetaStore::open(&meta_config).unwrap());
let allocators = Allocators::new(path, true, meta_store).unwrap();
let chunk = allocators.allocate(CHUNK_SIZE_NORMAL, true).unwrap();
let writing_list: Arc<WritingList> = Default::default();
if has_list {
// Create an empty bucket for the prefix only; the chunk id itself stays absent.
writing_list
.entry(Bytes::from(b"te".as_slice()))
.or_default();
}
let writing_chunk = WritingChunk {
chunk_id: b"test".as_ref().into(),
chunk,
list: writing_list.clone(),
prefix_len: 2,
is_remove: false,
commit_succ,
};
println!("{:#?}", writing_chunk);
// writing_chunk is dropped here, which is where the expected panic originates.
}
/// No prefix bucket at all.
#[test]
#[should_panic(expected = "chunk id [116, 101, 115, 116] is not in the writing list!")]
fn test_writing_chunk_not_in_list_1() {
test_writing_chunk_not_in_list(false, false);
}
/// Bucket exists, commit not successful, entry missing.
#[test]
#[should_panic(expected = "chunk id [116, 101, 115, 116] is not in the writing list!")]
fn test_writing_chunk_not_in_list_2() {
test_writing_chunk_not_in_list(true, false);
}
/// Bucket exists, commit successful, entry missing.
#[test]
#[should_panic(expected = "chunk id [116, 101, 115, 116] is not in the writing list!")]
fn test_writing_chunk_not_in_list_3() {
test_writing_chunk_not_in_list(true, true);
}
}

View File

@@ -0,0 +1,89 @@
use std::sync::{
atomic::{AtomicUsize, Ordering},
Arc,
};
use anyhow::{Context, Result};
use chunk_engine::*;
use serde::Deserialize;
/// Benchmark configuration, deserialized from the TOML file named on the command line.
#[derive(Debug, Default, Deserialize)]
struct Config {
/// Configuration passed to `Engine::open`.
engine: EngineConfig,
/// Number of writer threads to spawn.
threads: usize,
/// Number of chunk writes each thread performs.
count: usize,
/// Log level: "info" or "debug"; any other value selects WARN.
level: String,
}
/// Write-throughput benchmark for the chunk engine.
///
/// The first command-line argument is the path of a TOML file that
/// deserializes into `Config`. The program opens the engine, starts two
/// allocate workers, spawns `config.threads` writer threads that each write
/// `config.count` chunks of `CHUNK_SIZE_NORMAL` bytes, and logs throughput and
/// space usage once per second until every writer has finished.
///
/// # Errors
/// Returns an error when the config path is missing or the file cannot be
/// read or parsed.
///
/// # Panics
/// Panics if the engine cannot be opened, or (after the monitor loop ends) if
/// a writer thread panicked.
fn main() -> Result<()> {
    /// Gives back one slot of the running-writer counter when dropped, so the
    /// slot is released even if the writer unwinds from a panic. Without this
    /// a failed write would leave the counter above zero and the monitor loop
    /// below would never terminate.
    struct RunningGuard(Arc<AtomicUsize>);
    impl Drop for RunningGuard {
        fn drop(&mut self) {
            self.0.fetch_sub(1, Ordering::SeqCst);
        }
    }

    let mut iter = std::env::args();
    iter.next(); // skip the program name
    let config_path = iter
        .next()
        .ok_or_else(|| anyhow::anyhow!("get config path failed"))?;
    let content = std::fs::read_to_string(&config_path)
        .with_context(|| format!("failed to open config file {:?}", config_path))?;
    let config: Config = toml::from_str(&content)
        .with_context(|| format!("failed to parse config file {:?}", config_path))?;

    let level = match config.level.as_str() {
        "info" => tracing::Level::INFO,
        "debug" => tracing::Level::DEBUG,
        _ => tracing::Level::WARN,
    };
    tracing_subscriber::fmt().with_max_level(level).init();
    tracing::info!("config content: {:#?}", config);

    let engine = chunk_engine::Engine::open(&config.engine).unwrap();
    engine.start_allocate_workers(2);
    std::thread::sleep(std::time::Duration::from_millis(100));

    // `bytes` accumulates written bytes between two log lines;
    // `running` counts writers that have not finished yet.
    let bytes = Arc::new(AtomicUsize::default());
    let running = Arc::new(AtomicUsize::default());
    let threads = (0..config.threads)
        .map(|i| {
            let engine = engine.clone();
            let bytes = bytes.clone();
            let running = running.clone();
            let mut vec = create_aligned_vec(CHUNK_SIZE_NORMAL);
            vec.fill(i as u8);
            let checksum = crc32c::crc32c(&vec);
            // Count the writer before it starts so the monitor loop cannot
            // observe zero while threads are still being spawned.
            running.fetch_add(1, Ordering::SeqCst);
            Ok(std::thread::spawn(move || {
                let _running = RunningGuard(running);
                // The thread index in the high bits keeps ids of different writers apart.
                let mut chunk_id: usize = i << 32;
                for _ in 0..config.count {
                    engine
                        .write(&chunk_id.to_be_bytes(), &vec, 0, checksum)
                        .unwrap();
                    chunk_id += 1;
                    bytes.fetch_add(vec.len(), Ordering::SeqCst);
                }
            }))
        })
        .collect::<Result<Vec<_>>>()?;

    while running.load(Ordering::Acquire) > 0 {
        std::thread::sleep(std::time::Duration::from_secs(1));
        // Read and reset, so each line reports the bytes of the last interval.
        let bytes = bytes.swap(0, Ordering::Acquire);
        let used_size = engine.used_size();
        tracing::info!(
            "throughput: {:?}/s, allocated: {:?}, reserved: {:?}",
            Size::from(bytes),
            used_size.allocated_size,
            used_size.reserved_size,
        );
    }

    for thread in threads {
        thread.join().unwrap();
    }
    engine.stop_and_join();
    engine.speed_up_quit();
    Ok(())
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,3 @@
mod engine;
pub use engine::*;

View File

@@ -0,0 +1,598 @@
use std::collections::BTreeSet;
use std::path::PathBuf;
use std::pin::Pin;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use crate::*;
pub use ::cxx::CxxString;
/// FFI constructor: opens an engine with the given `path`, `create` flag and
/// `prefix_len`.
///
/// On success the engine is returned boxed. On failure the error text is
/// appended to `error` and a `Box` built from a null pointer is returned.
// NOTE(review): `Box` is documented as never null, so the value returned on
// the error path must not be used or dropped as a real `Engine`. Presumably
// the C++ caller checks `error` before touching the result — confirm.
fn create(path: &str, create: bool, prefix_len: usize, error: Pin<&mut CxxString>) -> Box<Engine> {
let config = EngineConfig {
path: PathBuf::from(path),
create,
prefix_len,
};
match Engine::open(&config) {
Ok(engine) => Box::new(engine),
Err(e) => {
error.push_str(&e.to_string());
unsafe { Box::from_raw(std::ptr::null_mut()) }
}
}
}
/// Opaque handle returned by `init_log`; it only holds the non-blocking
/// writer's `WorkerGuard`, which is never read (hence `dead_code`).
#[allow(dead_code)]
struct LogGuard(tracing_appender::non_blocking::WorkerGuard);
/// FFI entry point that installs a global `tracing` subscriber writing to a
/// rolling file at `path` (INFO level, no ANSI colors, non-blocking writer).
///
/// On success the returned guard wraps the writer's `WorkerGuard`. On failure
/// the error text is appended to `error` and a `Box` built from a null
/// pointer is returned.
// NOTE(review): `Box` is documented as never null, so the error-path value
// must not be used or dropped as a real `LogGuard`. Presumably the C++ caller
// checks `error` first — confirm.
fn init_log(path: &str, error: Pin<&mut CxxString>) -> Box<LogGuard> {
match rolling_file::BasicRollingFileAppender::new(
path,
// Roll over once a file reaches 500 MiB.
rolling_file::RollingConditionBasic::new().max_size(Size::mebibyte(500).into()),
// presumably the number of rolled files to keep — confirm against the rolling_file docs
20,
) {
Ok(file_appender) => {
let (non_blocking, guard) = tracing_appender::non_blocking(file_appender);
tracing_subscriber::fmt()
.with_max_level(tracing::Level::INFO)
.with_writer(non_blocking)
.with_ansi(false)
.init();
Box::new(LogGuard(guard))
}
Err(e) => {
error.push_str(&e.to_string());
unsafe { Box::from_raw(std::ptr::null_mut()) }
}
}
}
// FFI-facing accessors on Chunk, exposed through the cxx bridge.
impl Chunk {
/// Views the chunk's `ChunkMeta` as the bridge's `RawMeta` by transmuting the reference.
// NOTE(review): this relies on `RawMeta` matching the leading layout of `ChunkMeta`; only alignment is asserted in this file — confirm.
fn raw_meta(&self) -> &ffi::RawMeta {
unsafe { std::mem::transmute(self.meta()) }
}
/// The `etag` bytes stored in the chunk's metadata.
fn raw_etag(&self) -> &[u8] {
&self.meta().etag
}
/// The `uncommitted` flag stored in the chunk's metadata.
fn uncommitted(&self) -> bool {
self.meta().uncommitted
}
}
// FFI-facing accessors on WritingChunk; each forwards to the wrapped Chunk.
impl WritingChunk {
/// `RawMeta` view of the wrapped chunk.
fn raw_meta(&self) -> &ffi::RawMeta {
self.chunk.raw_meta()
}
/// `etag` bytes of the wrapped chunk.
fn raw_etag(&self) -> &[u8] {
self.chunk.raw_etag()
}
/// `uncommitted` flag of the wrapped chunk.
fn uncommitted(&self) -> bool {
self.chunk.uncommitted()
}
/// Borrowed pointer to the wrapped chunk; it points into `self`, so it is
/// only valid while this `WritingChunk` is alive and not moved.
fn raw_chunk(&self) -> *const Chunk {
&self.chunk
}
/// Sets the chain version on the wrapped chunk.
fn set_chain_ver(&mut self, chain_ver: u32) {
self.chunk.set_chain_ver(chain_ver);
}
}
impl Engine {
/// Returns `used_size()` reinterpreted as the bridge's `RawUsedSize`. The two
/// types are asserted to have equal size and alignment at the end of this file.
fn raw_used_size(&self) -> ffi::RawUsedSize {
unsafe { std::mem::transmute(self.used_size()) }
}
/// Looks up one chunk and returns it as a raw pointer obtained from
/// `Arc::into_raw`; the caller gives it back with `release_raw_chunk`.
///
/// A missing chunk yields a null pointer with `error` cleared. A lookup
/// failure yields a null pointer with the error text appended to `error`.
fn get_raw_chunk(&self, chunk_id: &[u8], error: Pin<&mut CxxString>) -> *const Chunk {
    let found = match self.get(chunk_id) {
        Ok(found) => found,
        Err(e) => {
            error.push_str(&e.to_string());
            return std::ptr::null();
        }
    };
    error.clear();
    match found {
        Some(chunk) => Arc::into_raw(chunk),
        None => std::ptr::null(),
    }
}
/// Batch lookup: fills `chunk_ptr` of every request with a pointer obtained
/// from `Arc::into_raw` (or null when the chunk does not exist) and clears
/// `error`.
///
/// If the batch lookup fails, the error text is appended to `error` and the
/// requests are left unmodified.
fn get_raw_chunks(&self, reqs: &mut [GetReq], error: Pin<&mut CxxString>) {
    let chunk_ids = reqs
        .iter()
        .map(|r| Bytes::from(r.chunk_id))
        .collect::<BTreeSet<_>>();
    let chunks = match self.batch_get(&chunk_ids) {
        Ok(chunks) => chunks,
        Err(e) => {
            error.push_str(&e.to_string());
            return;
        }
    };
    for req in reqs {
        // Each request gets its own strong reference, even for repeated ids.
        req.chunk_ptr = chunks
            .get(req.chunk_id)
            .map_or(std::ptr::null(), |c| Arc::into_raw(c.clone()));
    }
    error.clear();
}
/// Releases a chunk pointer previously produced by `Arc::into_raw`
/// (see `get_raw_chunk` / `get_raw_chunks`). Null is accepted and ignored.
///
/// # Safety
/// A non-null `chunk` must come from `Arc::into_raw` and must not have been
/// released already.
unsafe fn release_raw_chunk(&self, chunk: *const Chunk) {
    if chunk.is_null() {
        return;
    }
    // Rebuilding the Arc and dropping it gives back one strong reference.
    drop(Arc::from_raw(chunk));
}
/// Destroys a writing chunk previously returned by `update_raw_chunk`
/// without committing it. Null is accepted and ignored.
///
/// # Safety
/// A non-null `chunk` must come from `Box::into_raw` and must not have been
/// released or committed already.
unsafe fn release_writing_chunk(&self, chunk: *mut WritingChunk) {
    if chunk.is_null() {
        return;
    }
    drop(Box::from_raw(chunk));
}
/// Runs `update_chunk` and returns the resulting writing chunk as an owning
/// raw pointer (from `Box::into_raw`).
///
/// On failure the error text is appended to `error`, `req.out_error_code` is
/// set to the numeric status code for the error variant, and null is returned.
fn update_raw_chunk(
    &self,
    chunk_id: &[u8],
    mut req: Pin<&mut ffi::UpdateReq>,
    error: Pin<&mut CxxString>,
) -> *mut WritingChunk {
    let failure = match self.update_chunk(chunk_id, &mut req) {
        Ok(chunk) => return Box::into_raw(Box::new(chunk)),
        Err(e) => e,
    };
    error.push_str(&failure.to_string());
    // The match is intentionally exhaustive so that a new Error variant
    // forces a decision about its status code.
    let code = match failure {
        Error::IoError(_) => 4011,              // ChunkWriteFailed
        Error::RocksDBError(_) => 4003,         // ChunkMetadataSetError
        Error::MetaError(_) => 4002,            // ChunkMetadataGetError
        Error::InvalidArg(_) => 3,              // InvalidArg
        Error::SerializationError(_) => 4002,   // ChunkMetadataGetError
        Error::ChecksumMismatch(_) => 4080,     // ChecksumMismatch
        Error::ChainVersionMismatch(_) => 4081, // ChainVersionMismatch
        Error::ChunkETagMismatch(_) => 4083,    // ChunkETagMismatch
        Error::ChunkAlreadyExists => 4084,      // ChunkAlreadyExists
        Error::ChunkCommittedUpdate(_) => 4008, // ChunkCommittedUpdate
        Error::ChunkMissingUpdate(_) => 4007,   // ChunkMissingUpdate
        Error::NoSpace => 7021,                 // NoSpace
    };
    req.out_error_code = code;
    std::ptr::null_mut()
}
/// Commits a writing chunk previously returned by `update_raw_chunk`,
/// taking ownership of it. A commit failure appends the error text to `error`.
///
/// A null `new_chunk` (which `update_raw_chunk` returns on failure) is
/// rejected with an error message instead of being turned into a `Box`.
///
/// # Safety
/// A non-null `new_chunk` must come from `Box::into_raw` and must not be used
/// by the caller afterwards.
unsafe fn commit_raw_chunk(
    &self,
    new_chunk: *mut WritingChunk,
    sync: bool,
    error: Pin<&mut CxxString>,
) {
    if new_chunk.is_null() {
        error.push_str("commit a null writing chunk");
        return;
    }
    let new_chunk = Box::from_raw(new_chunk);
    if let Err(e) = self.commit_chunk(*new_chunk, sync) {
        error.push_str(&e.to_string());
    }
}
/// Commits a batch of writing chunks, taking ownership of every pointer in
/// `reqs`. A commit failure appends the error text to `error`.
///
/// # Safety
/// Every pointer is passed to `Box::from_raw` without a null check, so each
/// one must be non-null, must come from `Box::into_raw`, and must not be used
/// by the caller afterwards.
unsafe fn commit_raw_chunks(
&self,
reqs: &[*mut WritingChunk],
sync: bool,
error: Pin<&mut CxxString>,
) {
// Reclaim ownership of every writing chunk before committing them together.
let chunks = reqs.iter().map(|c| *Box::from_raw(*c)).collect::<Vec<_>>();
match self.commit_chunks(chunks, sync) {
Ok(_) => (),
Err(e) => error.push_str(&e.to_string()),
}
}
/// FFI wrapper around `query_chunks(begin, end, max_count)`.
///
/// Returns the result boxed as `RawChunks`. On failure the error text is
/// appended to `error` and an empty `RawChunks` is returned.
fn query_raw_chunks(
    &self,
    begin: &[u8],
    end: &[u8],
    max_count: u64,
    error: Pin<&mut CxxString>,
) -> Box<RawChunks> {
    self.query_chunks(begin, end, max_count)
        .map(|vec| Box::new(RawChunks { vec }))
        .unwrap_or_else(|e| {
            error.push_str(&e.to_string());
            Default::default()
        })
}
/// FFI wrapper around `query_all_chunks(prefix)`.
///
/// Returns the result boxed as `RawChunks`. On failure the error text is
/// appended to `error` and an empty `RawChunks` is returned.
fn query_all_raw_chunks(&self, prefix: &[u8], error: Pin<&mut CxxString>) -> Box<RawChunks> {
    let vec = match self.query_all_chunks(prefix) {
        Ok(vec) => vec,
        Err(e) => {
            error.push_str(&e.to_string());
            return Default::default();
        }
    };
    Box::new(RawChunks { vec })
}
/// FFI wrapper around `query_chunks_by_timestamp(prefix, begin, end, max_count)`.
///
/// Returns the result boxed as `RawChunks`. On failure the error text is
/// appended to `error` and an empty `RawChunks` is returned.
fn query_raw_chunks_by_timestamp(
    &self,
    prefix: &[u8],
    begin: u64,
    end: u64,
    max_count: u64,
    error: Pin<&mut CxxString>,
) -> Box<RawChunks> {
    let queried = self.query_chunks_by_timestamp(prefix, begin, end, max_count);
    queried
        .map(|vec| Box::new(RawChunks { vec }))
        .unwrap_or_else(|e| {
            error.push_str(&e.to_string());
            Default::default()
        })
}
/// FFI wrapper around `batch_remove(begin, end, max_count)`.
///
/// Returns the count reported by `batch_remove`. On failure the error text is
/// appended to `error` and 0 is returned.
fn raw_batch_remove(
    &self,
    begin: &[u8],
    end: &[u8],
    max_count: u64,
    error: Pin<&mut CxxString>,
) -> u64 {
    self.batch_remove(begin, end, max_count)
        .unwrap_or_else(|e| {
            error.push_str(&e.to_string());
            0
        })
}
/// FFI wrapper around `meta_store.query_used_size(prefix)`.
///
/// On failure the error text is appended to `error` and 0 is returned.
fn query_raw_used_size(&self, prefix: &[u8], error: Pin<&mut CxxString>) -> u64 {
    let queried = self.meta_store.query_used_size(prefix);
    if let Err(e) = &queried {
        error.push_str(&e.to_string());
    }
    queried.unwrap_or(0)
}
/// Takes a snapshot of the engine metrics and resets every counter to zero.
///
/// Each counter is read with an atomic swap-to-zero. Latency fields are
/// reported as the accumulated latency divided by the matching `*_times`
/// counter (with the divisor clamped to at least 1).
fn get_metrics(&self) -> ffi::Metrics {
    // Read-and-reset for a single counter.
    let take = |counter: &std::sync::atomic::AtomicU64| counter.swap(0, Ordering::AcqRel);
    // Mean that tolerates a zero event count.
    let mean = |total: u64, times: u64| total / std::cmp::max(1, times);

    let source = self.metrics.as_ref();
    let cow_times = take(&source.copy_on_write_times);
    let cow_latency = take(&source.copy_on_write_latency);
    let cow_read_times = take(&source.copy_on_write_read_times);
    let cow_read_latency = take(&source.copy_on_write_read_latency);
    let alloc_latency = take(&source.allocate_latency);
    let alloc_times = take(&source.allocate_times);
    let write_latency = take(&source.pwrite_latency);
    let write_times = take(&source.pwrite_times);

    ffi::Metrics {
        copy_on_write_times: cow_times,
        copy_on_write_latency: mean(cow_latency, cow_times),
        copy_on_write_read_bytes: take(&source.copy_on_write_read_bytes),
        copy_on_write_read_times: cow_read_times,
        copy_on_write_read_latency: mean(cow_read_latency, cow_read_times),
        checksum_reuse: take(&source.checksum_reuse),
        checksum_combine: take(&source.checksum_combine),
        checksum_recalculate: take(&source.checksum_recalculate),
        safe_write_direct_append: take(&source.safe_write_direct_append),
        safe_write_indirect_append: take(&source.safe_write_indirect_append),
        safe_write_truncate_shorten: take(&source.safe_write_truncate_shorten),
        safe_write_truncate_extend: take(&source.safe_write_truncate_extend),
        safe_write_read_tail_times: take(&source.safe_write_read_tail_times),
        safe_write_read_tail_bytes: take(&source.safe_write_read_tail_bytes),
        allocate_latency: mean(alloc_latency, alloc_times),
        allocate_times: alloc_times,
        pwrite_latency: mean(write_latency, write_times),
        pwrite_times: write_times,
    }
}
/// FFI wrapper around `query_uncommitted_chunks(prefix)`.
///
/// Returns the result boxed as `RawChunks`. On failure the error text is
/// appended to `error` and an empty `RawChunks` is returned.
fn query_uncommitted_raw_chunks(
    &self,
    prefix: &[u8],
    error: Pin<&mut CxxString>,
) -> Box<RawChunks> {
    let chunks = match self.query_uncommitted_chunks(prefix) {
        Ok(chunks) => chunks,
        Err(e) => {
            error.push_str(&e.to_string());
            return Default::default();
        }
    };
    Box::new(RawChunks { vec: chunks })
}
/// FFI wrapper around `handle_uncommitted_chunks(prefix, chain_ver)`.
///
/// Returns the result boxed as `RawChunks`. On failure the error text is
/// appended to `error` and an empty `RawChunks` is returned.
fn handle_uncommitted_raw_chunks(
    &self,
    prefix: &[u8],
    chain_ver: u32,
    error: Pin<&mut CxxString>,
) -> Box<RawChunks> {
    self.handle_uncommitted_chunks(prefix, chain_ver)
        .map(|chunks| Box::new(RawChunks { vec: chunks }))
        .unwrap_or_else(|e| {
            error.push_str(&e.to_string());
            Box::default()
        })
}
}
/// Opaque query result handed across the FFI boundary: a list of
/// `(chunk id, chunk metadata)` pairs accessed by position.
#[derive(Default)]
struct RawChunks {
vec: Vec<(Bytes, ChunkMeta)>,
}
// Positional accessors exposed through the cxx bridge. The `pos` arguments
// are used for plain indexing, so an out-of-range `pos` panics.
impl RawChunks {
/// Number of entries.
fn len(&self) -> usize {
self.vec.len()
}
/// Chunk id of the entry at `pos`.
fn chunk_id(&self, pos: usize) -> &[u8] {
self.vec[pos].0.as_ref()
}
/// Metadata of the entry at `pos`, viewed as `RawMeta` by transmuting the reference.
// NOTE(review): relies on `RawMeta` matching the leading layout of `ChunkMeta`; only alignment is asserted in this file — confirm.
fn chunk_meta(&self, pos: usize) -> &ffi::RawMeta {
unsafe { std::mem::transmute(&self.vec[pos].1) }
}
/// `etag` bytes of the entry at `pos`.
fn chunk_etag(&self, pos: usize) -> &[u8] {
&self.vec[pos].1.etag
}
/// `uncommitted` flag of the entry at `pos`.
fn chunk_uncommitted(&self, pos: usize) -> bool {
self.vec[pos].1.uncommitted
}
}
// cxx bridge: shared plain-data structs plus the Rust types and functions
// exported to C++ under the `hf3fs::chunk_engine` namespace.
#[::cxx::bridge(namespace = "hf3fs::chunk_engine")]
pub mod ffi {
// Request/response record for `update_raw_chunk`. `out_error_code` is
// written by `update_raw_chunk` on failure; the other `out_*` fields are
// presumably filled by the engine's update path — confirm.
#[derive(Default, Clone, Copy, PartialEq, Eq, Debug)]
struct UpdateReq {
without_checksum: bool,
is_truncate: bool,
is_remove: bool,
is_syncing: bool,
update_ver: u32,
chain_ver: u32,
checksum: u32,
length: u32,
offset: u32,
data: u64,
last_request_id: u64,
last_client_low: u64,
last_client_high: u64,
expected_tag: &'static [u8],
desired_tag: &'static [u8],
create_new: bool,
out_non_existent: bool,
out_error_code: u16,
out_commit_ver: u32,
out_chain_ver: u32,
out_checksum: u32,
}
// One entry of a batch lookup: `chunk_id` is the input and `chunk_ptr` is
// filled by `get_raw_chunks` (null when the chunk does not exist).
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct GetReq<'a> {
chunk_id: &'a [u8],
chunk_ptr: *const Chunk,
}
// Plain-data view of chunk metadata. References to the crate's `ChunkMeta`
// are transmuted to this type by the `raw_meta` / `chunk_meta` accessors.
#[derive(Default, Clone, Copy, PartialEq, Eq, Debug)]
struct RawMeta {
pos: u64,
chain_ver: u32,
chunk_ver: u32,
len: u32,
checksum: u32,
timestamp: u64,
last_request_id: u64,
last_client_low: u64,
last_client_high: u64,
}
// Plain-data counterpart of the crate's `UsedSize`; produced by transmute in
// `raw_used_size`.
#[derive(Default, Clone, Copy, PartialEq, Eq, Debug)]
struct RawUsedSize {
allocated_size: u64,
reserved_size: u64,
position_count: u64,
position_rc: u64,
}
// File descriptor and byte offset locating a chunk's data.
#[derive(Default, Clone, Copy, PartialEq, Eq, Debug)]
struct FdAndOffset {
fd: i32,
offset: u64,
}
// Snapshot returned by `get_metrics`. Field order mirrors the crate's atomic
// `Metrics` struct; latency fields carry per-call averages.
#[derive(Default, Clone, Copy, PartialEq, Eq, Debug)]
pub struct Metrics {
pub copy_on_write_times: u64,
pub copy_on_write_latency: u64,
pub copy_on_write_read_bytes: u64,
pub copy_on_write_read_times: u64,
pub copy_on_write_read_latency: u64,
pub checksum_reuse: u64,
pub checksum_combine: u64,
pub checksum_recalculate: u64,
pub safe_write_direct_append: u64,
pub safe_write_indirect_append: u64,
pub safe_write_truncate_shorten: u64,
pub safe_write_truncate_extend: u64,
pub safe_write_read_tail_times: u64,
pub safe_write_read_tail_bytes: u64,
pub allocate_times: u64,
pub allocate_latency: u64,
pub pwrite_times: u64,
pub pwrite_latency: u64,
}
// Engine: construction, maintenance, lookup, update/commit and query entry
// points. Functions taking `error` report failures by appending text to it.
extern "Rust" {
type Engine;
fn create(
path: &str,
create: bool,
prefix_len: usize,
error: Pin<&mut CxxString>,
) -> Box<Engine>;
fn raw_used_size(&self) -> RawUsedSize;
fn allocate_groups(&self, min_remain: usize, max_remain: usize, batch_size: usize)
-> usize;
fn allocate_ultra_groups(
&self,
min_remain: usize,
max_remain: usize,
batch_size: usize,
) -> usize;
fn compact_groups(&self, max_reserved: u64) -> usize;
fn set_allow_to_allocate(&self, val: bool);
fn speed_up_quit(&self);
// Pointers returned by the two lookups below must be given back with
// `release_raw_chunk`.
fn get_raw_chunk(&self, chunk_id: &[u8], error: Pin<&mut CxxString>) -> *const Chunk;
fn get_raw_chunks(&self, reqs: &mut [GetReq], error: Pin<&mut CxxString>);
unsafe fn release_raw_chunk(&self, chunk: *const Chunk);
unsafe fn release_writing_chunk(&self, chunk: *mut WritingChunk);
// A writing chunk returned here is consumed by either a commit call or
// `release_writing_chunk`.
fn update_raw_chunk(
&self,
chunk_id: &[u8],
req: Pin<&mut UpdateReq>,
error: Pin<&mut CxxString>,
) -> *mut WritingChunk;
unsafe fn commit_raw_chunk(
&self,
new_chunk: *mut WritingChunk,
sync: bool,
error: Pin<&mut CxxString>,
);
unsafe fn commit_raw_chunks(
&self,
reqs: &[*mut WritingChunk],
sync: bool,
error: Pin<&mut CxxString>,
);
fn query_raw_chunks(
&self,
begin: &[u8],
end: &[u8],
max_count: u64,
error: Pin<&mut CxxString>,
) -> Box<RawChunks>;
fn query_all_raw_chunks(&self, prefix: &[u8], error: Pin<&mut CxxString>)
-> Box<RawChunks>;
fn query_raw_chunks_by_timestamp(
&self,
prefix: &[u8],
begin: u64,
end: u64,
max_count: u64,
error: Pin<&mut CxxString>,
) -> Box<RawChunks>;
fn raw_batch_remove(
&self,
begin: &[u8],
end: &[u8],
max_count: u64,
error: Pin<&mut CxxString>,
) -> u64;
fn query_raw_used_size(&self, prefix: &[u8], error: Pin<&mut CxxString>) -> u64;
fn get_metrics(&self) -> Metrics;
fn query_uncommitted_raw_chunks(
&self,
prefix: &[u8],
error: Pin<&mut CxxString>,
) -> Box<RawChunks>;
fn handle_uncommitted_raw_chunks(
&self,
prefix: &[u8],
chain_ver: u32,
error: Pin<&mut CxxString>,
) -> Box<RawChunks>;
}
// Logging: the returned guard wraps the non-blocking writer's worker guard.
extern "Rust" {
type LogGuard;
fn init_log(path: &str, error: Pin<&mut CxxString>) -> Box<LogGuard>;
}
// Read-only view of a stored chunk.
extern "Rust" {
type Chunk;
fn raw_meta(&self) -> &RawMeta;
fn raw_etag(&self) -> &[u8];
fn uncommitted(&self) -> bool;
fn fd_and_offset(&self) -> FdAndOffset;
}
// A chunk write in progress.
extern "Rust" {
type WritingChunk;
fn raw_meta(&self) -> &RawMeta;
fn raw_etag(&self) -> &[u8];
fn uncommitted(&self) -> bool;
fn raw_chunk(&self) -> *const Chunk;
fn set_chain_ver(&mut self, chain_ver: u32);
}
// Query result list, accessed by position.
extern "Rust" {
type RawChunks;
fn len(&self) -> usize;
fn chunk_id(&self, pos: usize) -> &[u8];
fn chunk_meta(&self, pos: usize) -> &RawMeta;
fn chunk_etag(&self, pos: usize) -> &[u8];
fn chunk_uncommitted(&self, pos: usize) -> bool;
}
}
// Compile-time layout checks backing the transmutes in this file.
//
// ChunkMeta / RawMeta: only alignment is checked. ChunkMeta also carries
// `etag` and `uncommitted`, which RawMeta does not declare.
// NOTE(review): the reference transmutes additionally rely on RawMeta's fields
// matching the leading fields of ChunkMeta, which is not asserted here — confirm.
static_assertions::const_assert_eq!(
std::mem::align_of::<ChunkMeta>(),
std::mem::align_of::<ffi::RawMeta>()
);
// UsedSize is transmuted by value into RawUsedSize, so size and alignment must match.
static_assertions::const_assert_eq!(
std::mem::size_of::<UsedSize>(),
std::mem::size_of::<ffi::RawUsedSize>()
);
static_assertions::const_assert_eq!(
std::mem::align_of::<UsedSize>(),
std::mem::align_of::<ffi::RawUsedSize>()
);
// The atomic Metrics struct and its exported snapshot must stay field-for-field in sync.
static_assertions::const_assert_eq!(
std::mem::size_of::<Metrics>(),
std::mem::size_of::<ffi::Metrics>()
);
static_assertions::const_assert_eq!(
std::mem::align_of::<Metrics>(),
std::mem::align_of::<ffi::Metrics>()
);

View File

@@ -0,0 +1,176 @@
use std::fs::File;
use std::os::fd::AsRawFd;
use std::os::unix::fs::FileExt;
use std::path::Path;
use std::{fs::OpenOptions, os::unix::fs::OpenOptionsExt};
use super::super::*;
/// `fallocate` mode used to deallocate a group: punch a hole in the range
/// while keeping the file size unchanged.
const PUNCH_HOLE_FLAGS: i32 = libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE;
/// One cluster file, opened twice so that each I/O can pick the handle that
/// suits its alignment.
pub struct Cluster {
/// Handle opened with `O_SYNC`; used for I/O that is not suitably aligned.
pub normal_fd: File,
/// Handle opened with `O_DIRECT` when the filesystem supports it (otherwise
/// `O_SYNC`); used for aligned I/O and for `fallocate`.
pub direct_fd: File,
}
impl Cluster {
pub fn open(path: &Path, create: bool, support_direct_io: bool) -> Result<Self> {
let normal_fd = OpenOptions::new()
.read(true)
.write(true)
.create(create)
.custom_flags(libc::O_SYNC)
.open(path)
.map_err(|err| Error::IoError(format!("open {:?} failed: {:?}", path, err)))?;
let direct_fd = OpenOptions::new()
.read(true)
.write(true)
.custom_flags(if support_direct_io {
libc::O_DIRECT
} else {
libc::O_SYNC
})
.open(path)
.map_err(|err| Error::IoError(format!("open {:?} failed: {:?}", path, err)))?;
Ok(Self {
normal_fd,
direct_fd,
})
}
/// Calls `fallocate(2)` on the byte range of `group_id`.
///
/// With `punch_hole` the range is deallocated using `PUNCH_HOLE_FLAGS`;
/// otherwise mode 0 is used to allocate it.
///
/// # Errors
/// Returns `Error::IoError` carrying the OS error when the call returns -1.
pub fn fallocate(&self, group_id: GroupId, punch_hole: bool) -> Result<()> {
    let fd = self.direct_fd.as_raw_fd();
    let mode = if punch_hole { PUNCH_HOLE_FLAGS } else { 0 };
    let res =
        unsafe { libc::fallocate(fd, mode, group_id.offset().into(), group_id.size().into()) };
    if res != -1 {
        return Ok(());
    }
    Err(Error::IoError(format!(
        "fallocate {} error: {:?}",
        fd,
        std::io::Error::last_os_error()
    )))
}
/// Fills `buf` completely with data read from `pos` plus `offset`.
///
/// The direct handle is used while the request is aligned (per
/// `is_aligned_io`) and the remaining length is aligned (per
/// `is_aligned_len`); otherwise the normal handle is used. Short reads are
/// continued and interrupted reads are retried.
///
/// # Errors
/// Returns `Error::IoError` when a read returns 0 bytes before `buf` is full
/// or when a read fails with anything other than `Interrupted`.
pub fn pread(&self, pos: Position, mut buf: &mut [u8], offset: u32) -> Result<()> {
    let aligned = is_aligned_io(buf, offset);
    let mut offset = pos.offset() + offset;
    while !buf.is_empty() {
        let fd = if aligned && is_aligned_len(buf.len() as u32) {
            &self.direct_fd
        } else {
            &self.normal_fd
        };
        match fd.read_at(buf, offset.into()) {
            Ok(0) => return Err(Error::IoError(format!("read {:?} return 0", fd))),
            Ok(n) => {
                buf = &mut buf[n..];
                // Advance with the same u64 arithmetic that pwrite uses.
                offset += n as u64;
            }
            // Interrupted reads fall through and are retried; other errors return.
            Err(e) => Self::handle_error(e)?,
        }
    }
    Ok(())
}
pub fn pwrite(&self, pos: Position, mut buf: &[u8], offset: u32) -> Result<()> {
let aligned = is_aligned_io(buf, offset);
let mut offset = pos.offset() + offset;
while !buf.is_empty() {
let fd = if aligned && is_aligned_len(buf.len() as u32) {
&self.direct_fd
} else {
&self.normal_fd
};
match fd.write_at(buf, offset.into()) {
Ok(0) => return Err(Error::IoError(format!("write {:?} return 0", fd))),
Ok(n) => {
buf = &buf[n..];
offset += n as u64;
}
Err(e) => Self::handle_error(e)?,
}
}
Ok(())
}
fn handle_error(e: std::io::Error) -> Result<()> {
if e.kind() == std::io::ErrorKind::Interrupted {
Ok(())
} else {
Err(Error::IoError(format!("io error: {:?}", e)))
}
}
}
// Tests for Cluster: open/create semantics, fallocate, pread/pwrite, and the
// error paths on unusable descriptors.
#[cfg(test)]
mod tests {
use super::*;
use std::os::fd::FromRawFd;
#[test]
fn test_cluster_open() {
let dir = tempfile::tempdir().unwrap();
let support_direct_io = FsType::check(&dir).support_direct_io();
for chunk_size in [CHUNK_SIZE_NORMAL, CHUNK_SIZE_SMALL, CHUNK_SIZE_LARGE] {
let file_path = dir.path().join(format!("test.cluster.{}", chunk_size));
// Opening a missing file without `create` fails; with `create` it yields an empty file.
assert!(Cluster::open(&file_path, false, support_direct_io).is_err());
let cluster = Cluster::open(&file_path, true, support_direct_io).unwrap();
let meta = cluster.normal_fd.metadata().unwrap();
assert_eq!(meta.len(), 0);
let cluster = Cluster::open(&file_path, false, support_direct_io).unwrap();
let group_id = GroupId::new(chunk_size, 0, 0);
let mut buf = [0u8; 5];
let pos = Position::new(group_id, 0);
// Reading from the still-empty file must fail.
assert!(cluster.pread(pos, &mut buf, 0).is_err());
// Allocating the group extends the file to the group size.
cluster.fallocate(group_id, false).unwrap();
let meta = cluster.normal_fd.metadata().unwrap();
assert_eq!(meta.len(), group_id.size());
// Round-trip a small, unaligned write and read.
let bytes = "hello world!".as_bytes();
assert!(cluster.pwrite(pos, bytes, 0).is_ok());
assert!(cluster.pread(pos, &mut buf, 0).is_ok());
assert_eq!(&buf, &bytes[0..buf.len()]);
// Punching a hole keeps the file length unchanged.
cluster.fallocate(group_id, true).unwrap();
let meta = cluster.normal_fd.metadata().unwrap();
assert_eq!(meta.len(), group_id.size());
}
assert!(Cluster::open(Path::new("/dev/null"), false, support_direct_io).is_err());
// /dev/null opened read-only: fallocate and pwrite are expected to fail.
let cluster = Cluster {
normal_fd: File::open("/dev/null").unwrap(),
direct_fd: File::open("/dev/null").unwrap(),
};
assert!(cluster.fallocate(GroupId::default(), false).is_err());
assert!(cluster.fallocate(GroupId::default(), true).is_err());
assert!(cluster.pwrite(Position::from(0), &[1], 0).is_err());
// 23333 is presumably not an open descriptor in the test process, so pread must fail.
let cluster = Cluster {
normal_fd: unsafe { File::from_raw_fd(23333) },
direct_fd: unsafe { File::from_raw_fd(23333) },
};
let mut buf = [0u8; 32];
assert!(cluster.pread(Position::from(0), &mut buf, 0).is_err());
// Skip the destructors so the fabricated descriptors are never closed.
std::mem::forget(cluster);
// EINTR is treated as retryable.
assert!(Cluster::handle_error(std::io::Error::from_raw_os_error(libc::EINTR)).is_ok());
}
}

View File

@@ -0,0 +1,118 @@
use super::super::*;
use std::{fmt::Debug, os::fd::AsRawFd, path::PathBuf};
/// Parameters for `Clusters::open`.
#[derive(Debug, Default)]
pub struct ClustersConfig {
/// Directory that holds the cluster files.
pub path: PathBuf,
/// Chunk size recorded on the resulting `Clusters`.
pub chunk_size: Size,
/// When set, the directory and the cluster files are created if missing.
pub create: bool,
}
/// The set of cluster files living in one directory; groups and positions are
/// routed to a file by their cluster index.
pub struct Clusters {
/// Directory the cluster files were opened from.
pub path: PathBuf,
/// Chunk size copied from the config.
pub chunk_size: Size,
/// Open cluster files, indexed by cluster id.
files: Vec<Cluster>,
}
impl Clusters {
    /// Number of cluster files per directory.
    const COUNT: u32 = 256;

    /// Opens all `COUNT` cluster files under `config.path`, named by their
    /// two-digit upper-case hexadecimal index ("00" through "FF").
    ///
    /// With `config.create` the directory is created first and missing files
    /// are created. Direct I/O is enabled according to the filesystem type
    /// detected at `config.path`.
    ///
    /// # Errors
    /// Returns `Error::IoError` if the directory cannot be created or any
    /// cluster file cannot be opened.
    pub fn open(config: &ClustersConfig) -> Result<Self> {
        if config.create {
            std::fs::create_dir_all(&config.path)
                .map_err(|e| Error::IoError(format!("create dir {:?} fail: {e:?}", config.path)))?;
        }
        let support_direct_io = FsType::check(&config.path).support_direct_io();
        // Stops at the first file that fails to open.
        let files = (0..Self::COUNT)
            .map(|cluster_id| {
                let file_path = config.path.join(format!("{:02X}", cluster_id));
                Cluster::open(&file_path, config.create, support_direct_io)
            })
            .collect::<Result<Vec<Cluster>>>()?;
        Ok(Clusters {
            path: config.path.clone(),
            chunk_size: config.chunk_size,
            files,
        })
    }

    /// Allocates the byte range of `group_id` in its cluster file.
    pub fn allocate(&self, group_id: GroupId) -> Result<()> {
        let file = &self.files[group_id.cluster() as usize];
        file.fallocate(group_id, false)
    }

    /// Punches a hole over the byte range of `group_id` in its cluster file.
    pub fn deallocate(&self, group_id: GroupId) -> Result<()> {
        let file = &self.files[group_id.cluster() as usize];
        file.fallocate(group_id, true)
    }

    /// Reads into `buf` from `pos` plus `offset` in the owning cluster file.
    pub fn pread(&self, pos: Position, buf: &mut [u8], offset: u32) -> Result<()> {
        let file = &self.files[pos.cluster() as usize];
        file.pread(pos, buf, offset)
    }

    /// Writes `buf` at `pos` plus `offset` in the owning cluster file.
    pub fn pwrite(&self, pos: Position, buf: &[u8], offset: u32) -> Result<()> {
        let file = &self.files[pos.cluster() as usize];
        file.pwrite(pos, buf, offset)
    }

    /// Returns the raw descriptor of the owning file's direct handle together
    /// with the byte offset of `pos`.
    pub fn fd_and_offset(&self, pos: Position) -> FdAndOffset {
        let file = &self.files[pos.cluster() as usize];
        FdAndOffset {
            fd: file.direct_fd.as_raw_fd(),
            offset: pos.offset().into(),
        }
    }
}
impl Debug for Clusters {
    /// Prints only the directory path; the chunk size and open files are omitted.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("Clusters");
        out.field("path", &self.path);
        out.finish()
    }
}
// Tests for Clusters: file length after allocate/deallocate, and open failure
// on a directory that cannot be created.
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_clusters() {
let dir = tempfile::tempdir().unwrap();
let config = ClustersConfig {
path: dir.path().into(),
chunk_size: CHUNK_SIZE_NORMAL,
create: true,
};
let clusters = Clusters::open(&config).unwrap();
let group_id = GroupId::new(CHUNK_SIZE_NORMAL, 0, 0);
let cluster = &clusters.files[0];
// Freshly created cluster files are empty.
let meta = cluster.normal_fd.metadata().unwrap();
assert_eq!(meta.len(), 0);
// Allocating the first group grows the file to one group.
clusters.allocate(group_id).unwrap();
let meta = cluster.normal_fd.metadata().unwrap();
assert_eq!(meta.len(), group_id.size());
// Allocating the group at index 3 grows the file to four groups.
let group_id_3 = GroupId::new(CHUNK_SIZE_NORMAL, 0, 3);
clusters.allocate(group_id_3).unwrap();
let meta = cluster.normal_fd.metadata().unwrap();
assert_eq!(meta.len(), group_id.size() * 4);
// Deallocation punches holes and never shrinks the file.
clusters.deallocate(group_id).unwrap();
let meta = cluster.normal_fd.metadata().unwrap();
assert_eq!(meta.len(), group_id.size() * 4);
clusters.deallocate(group_id_3).unwrap();
let meta = cluster.normal_fd.metadata().unwrap();
assert_eq!(meta.len(), group_id.size() * 4);
// Opening under /proc is expected to fail.
let config = ClustersConfig {
path: std::path::Path::new("/proc/test").into(),
chunk_size: CHUNK_SIZE_NORMAL,
create: true,
};
assert!(Clusters::open(&config).is_err());
}
}

View File

@@ -0,0 +1,33 @@
use std::{ffi::CString, os::unix::ffi::OsStrExt, path::Path};
/// File-system families distinguished by `FsType::check`.
#[derive(Debug, PartialEq, Clone, Copy)]
pub enum FsType {
/// `statfs` reported `libc::EXT4_SUPER_MAGIC`.
EXT4,
/// `statfs` reported `libc::NFS_SUPER_MAGIC`.
NFS,
/// `statfs` reported `libc::XFS_SUPER_MAGIC`.
XFS,
/// `statfs` reported the ZFS magic `0x2FC12FC1`.
ZFS,
/// Any other magic number, or the `statfs` call itself failed.
OTHER,
}
impl FsType {
    /// Detects the file system that hosts `path` by calling `statfs(2)` and
    /// matching the returned `f_type` magic.
    ///
    /// Returns `OTHER` when the type is not one of the known magics, when
    /// `statfs` fails, or when `path` cannot be converted to a C string.
    pub fn check(path: impl AsRef<Path>) -> Self {
        // A path containing an interior NUL byte cannot be passed to the C
        // API; treat it like any other undetectable path instead of panicking.
        let path_cstr = match CString::new(path.as_ref().as_os_str().as_bytes()) {
            Ok(cstr) => cstr,
            Err(_) => return Self::OTHER,
        };
        let mut stat: libc::statfs = unsafe { std::mem::zeroed() };
        let result = unsafe { libc::statfs(path_cstr.as_ptr(), &mut stat) };
        if result != 0 {
            return Self::OTHER;
        }
        match stat.f_type {
            libc::EXT4_SUPER_MAGIC => Self::EXT4,
            libc::NFS_SUPER_MAGIC => Self::NFS,
            libc::XFS_SUPER_MAGIC => Self::XFS,
            0x2FC12FC1 => Self::ZFS, // https://github.com/openzfs/zfs/blob/33174af15112ed5c53299da2d28e763b0163f428/include/sys/fs/zfs.h#L1339
            _ => Self::OTHER,
        }
    }

    /// Returns `false` only for `ZFS`; every other variant reports `true`.
    pub fn support_direct_io(&self) -> bool {
        !matches!(self, FsType::ZFS)
    }
}

View File

@@ -0,0 +1,7 @@
mod cluster;
mod clusters;
mod fs_type;
pub use cluster::*;
pub use clusters::*;
pub use fs_type::*;

View File

@@ -0,0 +1,18 @@
mod alloc;
mod core;
mod cxx;
mod file;
mod meta;
mod types;
mod utils;
pub use alloc::*;
pub use core::*;
pub use cxx::{
ffi::{FdAndOffset, GetReq, UpdateReq},
CxxString,
};
pub use file::*;
pub use meta::*;
pub use types::*;
pub use utils::*;

View File

@@ -0,0 +1,217 @@
use super::super::{Bytes, Error, GroupId, Position, Result};
use byteorder::{BigEndian, ByteOrder};
/// Owned byte string used as a key in the meta store. Every constructor
/// starts the key with one of the tag bytes declared as constants on
/// `MetaKey`.
pub struct MetaKey(Bytes);
impl MetaKey {
pub const CHUNK_META_KEY_PREFIX: u8 = 1;
pub const GROUP_BITS_KEY_PREFIX: u8 = 2;
pub const POS_TO_CHUNK_KEY_PREFIX: u8 = 3;
pub const USED_SIZE_KEY_PREFIX: u8 = 4;
pub const USED_SIZE_PREFIX_LEN_KEY: u8 = 5;
pub const TIMESTAMP_KEY_PREFIX: u8 = 6;
// pub const WRITING_CHUNK_KEY_PREFIX: u8 = 7;
pub const VERSION_KEY: u8 = 8;
pub const WRITING_CHUNK_KEY_PREFIX: u8 = 9;
pub const TEST_KEY_PREFIX: u8 = b'm';
fn prefix(mark: u8) -> Self {
let mut vec = Bytes::new();
vec.push(mark);
Self(vec)
}
pub fn chunk_meta_key_prefix() -> Self {
Self::prefix(Self::CHUNK_META_KEY_PREFIX)
}
pub fn chunk_meta_key(chunk_id: &[u8]) -> Self {
let mut out = Self::chunk_meta_key_prefix();
for num in chunk_id {
out.0.push(!num)
}
out
}
pub fn parse_chunk_meta_key(key: &[u8]) -> Bytes {
let mut out = Bytes::new();
for num in &key[1..] {
out.push(!num);
}
out
}
pub fn group_bits_key_prefix() -> Self {
Self::prefix(Self::GROUP_BITS_KEY_PREFIX)
}
pub fn group_bits_chunk_size_prefix(group_id: GroupId) -> Self {
let mut out = Self::group_bits_key_prefix();
out.0.extend_from_slice(&group_id.to_be_bytes()[..4]);
out
}
pub fn group_bits_key(group_id: GroupId) -> Self {
let mut out = Self::group_bits_key_prefix();
out.0.extend_from_slice(&group_id.to_be_bytes());
out
}
pub fn parse_group_bits_key(key: &[u8]) -> Result<GroupId> {
if key.len() == std::mem::size_of::<u8>() + std::mem::size_of::<u64>() {
let group_id = BigEndian::read_u64(&key[1..]);
Ok(GroupId::from(group_id))
} else {
Err(Error::MetaError(format!(
"parse group bits key fail: {:?}",
key
)))
}
}
pub fn pos_to_chunk_key_prefix() -> Self {
Self::prefix(Self::POS_TO_CHUNK_KEY_PREFIX)
}
pub fn group_to_chunks_key_prefix(group_id: GroupId) -> Self {
let mut out = Self::pos_to_chunk_key_prefix();
out.0
.extend_from_slice(&Position::new(group_id, 0).to_be_bytes());
out.0.pop();
out
}
pub fn pos_to_chunk_key(pos: Position) -> Self {
let mut out = Self::pos_to_chunk_key_prefix();
out.0.extend_from_slice(&pos.to_be_bytes());
out
}
pub fn parse_pos_to_chunk_key(key: &[u8]) -> Result<Position> {
if key.len() == std::mem::size_of::<u8>() + std::mem::size_of::<u64>() {
Ok(Position::from(BigEndian::read_u64(&key[1..])))
} else {
Err(Error::MetaError(format!(
"parse pos to chunk key fail: {:?}",
key
)))
}
}
pub fn used_size_key_prefix() -> Self {
Self::prefix(Self::USED_SIZE_KEY_PREFIX)
}
pub fn used_size_key(prefix: &[u8]) -> Self {
let mut out = Self::used_size_key_prefix();
out.0.extend_from_slice(prefix);
out
}
pub fn used_size_prefix_len_key() -> Self {
Self::prefix(Self::USED_SIZE_PREFIX_LEN_KEY)
}
pub fn timestamp_key_prefix() -> Self {
Self::prefix(Self::TIMESTAMP_KEY_PREFIX)
}
pub fn timestamp_key_filter(prefix: &[u8], timestamp: u64) -> Self {
let mut out = Self::timestamp_key_prefix();
out.0.extend_from_slice(prefix);
out.0.extend_from_slice(&timestamp.to_be_bytes());
out
}
pub fn timestamp_key(timestamp: u64, chunk_id: &[u8], prefix_len: usize) -> Self {
let mut out = Self::timestamp_key_filter(&chunk_id[..prefix_len], timestamp);
out.0.extend_from_slice(&chunk_id[prefix_len..]);
out
}
pub fn parse_timestamp_key(key: &[u8], prefix_len: usize) -> Result<(u64, Bytes)> {
const L: usize = std::mem::size_of::<u8>() + std::mem::size_of::<u64>();
if key.len() > L + prefix_len {
let mut chunk_id = Bytes::from(&key[1..1 + prefix_len]);
let timestamp = BigEndian::read_u64(&key[1 + prefix_len..]);
chunk_id.extend_from_slice(&key[L + prefix_len..]);
Ok((timestamp, chunk_id))
} else {
Err(Error::MetaError(format!(
"parse timestamp key fail: {:?}",
key
)))
}
}
pub fn version_key() -> Self {
Self::prefix(Self::VERSION_KEY)
}
pub fn writing_chunk_key_prefix() -> Self {
Self::prefix(Self::WRITING_CHUNK_KEY_PREFIX)
}
pub fn writing_chunk_key(chunk_id: &[u8]) -> Self {
let mut out = Self::writing_chunk_key_prefix();
out.0.extend_from_slice(chunk_id);
out
}
pub fn parse_writing_chunk_key(key: &[u8]) -> Result<Bytes> {
if key.len() > 1 {
Ok(Bytes::from(&key[1..]))
} else {
Err(Error::MetaError(format!(
"parse writing chunk key fail: {:?}",
key
)))
}
}
}
impl AsRef<[u8]> for MetaKey {
fn as_ref(&self) -> &[u8] {
&self.0
}
}
#[cfg(test)]
mod tests {
// Exercises key construction and the matching parsers of `MetaKey`.
#[test]
fn test_meta_key_create() {
use super::super::super::*;
// A bare prefix key is the tag byte alone.
let prefix = MetaKey::chunk_meta_key_prefix();
assert_eq!(prefix.as_ref(), [MetaKey::CHUNK_META_KEY_PREFIX]);
// Chunk-meta keys store the id bytes bitwise-inverted.
let meta_key = MetaKey::chunk_meta_key(&[1, 2, 3, 4]);
assert_eq!(
meta_key.as_ref(),
[MetaKey::CHUNK_META_KEY_PREFIX, !1, !2, !3, !4]
);
// Position keys are tag + 8 bytes and round-trip through the parser.
let group_id = GroupId::new(CHUNK_SIZE_NORMAL, 1, 2);
let pos = Position::new(group_id, 3);
let pos_to_chunk_key = MetaKey::pos_to_chunk_key(pos);
assert_eq!(pos_to_chunk_key.as_ref().len(), 1 + 8);
let parsed_pos = MetaKey::parse_pos_to_chunk_key(pos_to_chunk_key.as_ref()).unwrap();
assert_eq!(pos, parsed_pos);
// The per-group prefix is one byte shorter than a full position key.
let group_to_chunks_key_prefix = MetaKey::group_to_chunks_key_prefix(group_id);
assert_eq!(group_to_chunks_key_prefix.as_ref().len(), 8);
// Empty keys are rejected by the fixed-length parsers.
assert!(MetaKey::parse_group_bits_key(&[]).is_err());
assert!(MetaKey::parse_pos_to_chunk_key(&[]).is_err());
// Timestamp keys round-trip with a prefix length of 2.
let timestamp_key = MetaKey::timestamp_key(1024, &[1, 2, 3, 4], 2);
let (timestamp, chunk) = MetaKey::parse_timestamp_key(&timestamp_key.0, 2).unwrap();
assert_eq!(timestamp, 1024);
assert_eq!(chunk, [1, 2, 3, 4].as_slice());
// A timestamp key with no chunk-id byte after the timestamp is an error.
MetaKey::parse_timestamp_key(&[MetaKey::TIMESTAMP_KEY_PREFIX, 0, 1, 2, 3, 4, 5, 6, 7], 0)
.unwrap_err();
// A writing-chunk key that is only the tag is an error.
MetaKey::parse_writing_chunk_key(MetaKey::writing_chunk_key_prefix().as_ref()).unwrap_err();
}
}

View File

@@ -0,0 +1,152 @@
use byteorder::{ByteOrder, LittleEndian};
use derse::{DownwardBytes, Serialize};
use super::super::{GroupState, MergeState, MetaKey};
/// Stateless marker type implementing `MergeOp` for the meta store; the
/// merge behavior is selected by the first byte of the key.
pub struct MetaMergeOp;
impl super::MergeOp for MetaMergeOp {
fn full_merge<'a>(
key: &[u8],
value: Option<&[u8]>,
operands: impl Iterator<Item = &'a [u8]>,
) -> Option<Vec<u8>> {
match key[0] {
MetaKey::GROUP_BITS_KEY_PREFIX => {
let mut merge_bits = MergeState::empty();
for op in operands {
merge_bits.merge(&MergeState::from(op).ok()?);
}
let mut bits = if let Some(group_bits) = value {
GroupState::from(group_bits).ok()?
} else {
GroupState::empty()
};
bits.update(&merge_bits);
Some(Vec::from(bits.as_bytes()))
}
MetaKey::USED_SIZE_KEY_PREFIX => {
let mut total = 0i64;
for op in operands {
if op.len() != std::mem::size_of_val(&total) {
return None;
}
total += LittleEndian::read_i64(op);
}
if let Some(value) = value {
if value.len() != std::mem::size_of_val(&total) {
return None;
}
total += LittleEndian::read_i64(value);
}
let mut vec = Vec::with_capacity(std::mem::size_of_val(&total));
vec.extend_from_slice(&total.to_le_bytes());
Some(vec)
}
MetaKey::TEST_KEY_PREFIX => {
let mut out = Vec::<u8>::new();
if let Some(value) = value {
out.extend_from_slice(value);
}
for op in operands {
out.extend_from_slice(op);
}
Some(out)
}
_ => None,
}
}
fn partial_merge<'a>(key: &[u8], operands: impl Iterator<Item = &'a [u8]>) -> Option<Vec<u8>> {
match key[0] {
MetaKey::GROUP_BITS_KEY_PREFIX => {
let mut merge_bits = MergeState::empty();
for op in operands {
merge_bits.merge(&MergeState::from(op).ok()?);
}
if let Ok(bytes) = merge_bits.serialize::<DownwardBytes>() {
Some(Vec::from(bytes.as_slice()))
} else {
None
}
}
MetaKey::USED_SIZE_KEY_PREFIX => {
let mut total = 0i64;
for op in operands {
if op.len() != std::mem::size_of_val(&total) {
return None;
}
total += LittleEndian::read_i64(op);
}
let mut vec = Vec::with_capacity(std::mem::size_of_val(&total));
vec.extend_from_slice(&total.to_le_bytes());
Some(vec)
}
_ => None,
}
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::*;

    /// The deltas 0..10, each encoded as a little-endian `i64`.
    fn encoded_deltas() -> Vec<Vec<u8>> {
        (0..10i64)
            .map(|delta| delta.to_le_bytes().to_vec())
            .collect()
    }

    /// A key with an unknown tag cannot be partially merged.
    #[test]
    fn test_meta_merge_op() {
        let operand: &[u8] = &[233u8];
        let merged = MetaMergeOp::partial_merge(&[233], std::iter::once(operand));
        assert_eq!(merged, None);
    }

    /// Used-size merges sum their operands (plus the stored value for a
    /// full merge) and reject operands of the wrong length.
    #[test]
    fn test_used_size_merge() {
        let expected_sum = (0..10).sum::<i64>();

        let ops = encoded_deltas();
        let merged = MetaMergeOp::partial_merge(
            &[MetaKey::USED_SIZE_KEY_PREFIX],
            ops.iter().map(|v| v.as_slice()),
        )
        .unwrap();
        assert_eq!(LittleEndian::read_i64(&merged), expected_sum);

        // test full merge.
        let ops = encoded_deltas();
        let value = 10i64;
        let merged = MetaMergeOp::full_merge(
            &[MetaKey::USED_SIZE_KEY_PREFIX],
            Some(value.to_le_bytes().as_slice()),
            ops.iter().map(|v| v.as_slice()),
        )
        .unwrap();
        assert_eq!(LittleEndian::read_i64(&merged), expected_sum + value);

        // test invalid ops.
        let invalid_ops = [vec![1, 2, 3]];
        let rejected = MetaMergeOp::partial_merge(
            &[MetaKey::USED_SIZE_KEY_PREFIX],
            invalid_ops.iter().map(|v| v.as_slice()),
        );
        assert_eq!(rejected, None);
    }
}

View File

@@ -0,0 +1,873 @@
use std::{cell::RefCell, collections::HashMap, ops::DerefMut};
use super::super::*;
use byteorder::{ByteOrder, LittleEndian};
use derse::{Deserialize, DownwardBytes, Serialize};
/// Configuration of a `MetaStore`.
#[derive(Debug, Default, Clone)]
pub struct MetaStoreConfig {
/// Options forwarded to `RocksDB::open`.
pub rocksdb: RocksDBConfig,
/// Number of leading chunk-id bytes used to bucket used-size counters and
/// to lay out timestamp keys. Chunk ids shorter than this are rejected.
pub prefix_len: usize,
}
/// Chunk metadata store backed by a `RocksDB` instance opened with
/// `MetaMergeOp` as merge operator.
pub struct MetaStore {
// Underlying key-value store.
rocksdb: RocksDB,
// Copy of the configuration passed to `open`.
config: MetaStoreConfig,
}
impl MetaStore {
// Per-thread scratch buffer for serialization, created with a capacity of
// `Size::MB`. It is only accessed through `with_tls_bytes`.
thread_local! {
static BYTES: RefCell<DownwardBytes> = RefCell::new(DownwardBytes::with_capacity(Size::MB.into()));
}
pub fn open(config: &MetaStoreConfig) -> Result<Self> {
let rocksdb = RocksDB::open::<MetaMergeOp>(&config.rocksdb)?;
let mut this = MetaStore {
rocksdb,
config: config.clone(),
};
this.update_used_size_if_need()?;
Ok(this)
}
pub fn get_chunk_meta(&self, chunk_id: &[u8]) -> Result<Option<ChunkMeta>> {
let chunk_meta_key = MetaKey::chunk_meta_key(chunk_id);
let value = self.rocksdb.get(chunk_meta_key)?;
if let Some(value) = value {
Ok(Some(
ChunkMeta::deserialize(value.as_ref()).map_err(Error::SerializationError)?,
))
} else {
Ok(None)
}
}
/// Runs `query_chunks_from_iterator` on a fresh iterator: returns up to
/// `max_count` chunks whose ids lie in `[begin, end)`.
pub fn query_chunks(
    &self,
    begin: impl AsRef<[u8]>,
    end: impl AsRef<[u8]>,
    max_count: u64,
) -> Result<Vec<(Bytes, ChunkMeta)>> {
    self.query_chunks_from_iterator(self.iterator(), begin, end, max_count)
}
pub fn query_chunks_from_iterator(
&self,
mut it: RocksDBIterator,
begin: impl AsRef<[u8]>,
end: impl AsRef<[u8]>,
max_count: u64,
) -> Result<Vec<(Bytes, ChunkMeta)>> {
let mut out = Vec::<(Bytes, ChunkMeta)>::with_capacity(4096);
let end_key = MetaKey::chunk_meta_key(end.as_ref());
it.seek(&end_key)?;
if it.key() == Some(end_key.as_ref()) {
it.next(); // [begin, end)
}
for _ in 0..max_count {
if !it.valid() {
break;
}
if it.key().unwrap()[0] != MetaKey::CHUNK_META_KEY_PREFIX {
break;
}
let chunk_id = MetaKey::parse_chunk_meta_key(it.key().unwrap());
if begin.as_ref() <= chunk_id.as_ref() {
let chunk_meta = ChunkMeta::deserialize(it.value().unwrap())
.map_err(Error::SerializationError)?;
out.push((chunk_id, chunk_meta))
} else {
break;
}
it.next();
}
Ok(out)
}
/// Returns up to `max_count` chunk ids that start with `prefix` and whose
/// timestamp lies in `[begin, end)`.
///
/// The scan starts at the timestamp key built from `prefix` and `begin` and
/// stops at the first key that has another tag, does not carry `prefix`
/// in its first `config.prefix_len` id bytes, or has a timestamp `>= end`.
/// A `prefix` whose length differs from `config.prefix_len` never matches,
/// so the result is empty.
pub fn query_chunks_by_timestamp(
    &self,
    prefix: &[u8],
    begin: u64,
    end: u64,
    max_count: u64,
) -> Result<Vec<Bytes>> {
    let prefix_len = self.config.prefix_len;
    let mut it = self.iterator();
    let mut out = Vec::<Bytes>::with_capacity(4096);
    let begin_key = MetaKey::timestamp_key_filter(prefix, begin);
    it.seek(&begin_key)?;
    for _ in 0..max_count {
        if !it.valid() {
            break;
        }
        let key = it.key().unwrap();
        if key.first() != Some(&MetaKey::TIMESTAMP_KEY_PREFIX) {
            break;
        }
        // `get` instead of slicing: the key may be shorter than
        // `1 + prefix_len` when `prefix` has an unexpected length, and
        // direct indexing would panic in that case.
        if key.get(1..1 + prefix_len) != Some(prefix) {
            break;
        }
        let (timestamp, chunk_id) = MetaKey::parse_timestamp_key(key, prefix_len)?;
        if timestamp >= end {
            break;
        }
        out.push(chunk_id);
        it.next();
    }
    Ok(out)
}
/// Applies `write_batch` to the database; `sync` selects the synchronous
/// write options.
#[inline(always)]
pub fn write(&self, write_batch: rocksdb::WriteBatch, sync: bool) -> Result<()> {
    let db = &self.rocksdb;
    db.write(write_batch, sync)
}
/// Records a new chunk: builds the batch with `add_chunk_mut` and writes
/// it, synchronously when `sync` is set.
pub fn add_chunk(&self, chunk_id: &[u8], chunk_meta: &ChunkMeta, sync: bool) -> Result<()> {
    let mut batch = RocksDB::new_write_batch();
    self.add_chunk_mut(chunk_id, chunk_meta, &mut batch)
        .and_then(|()| self.write(batch, sync))
}
/// Appends to `write_batch` every operation needed to record a new chunk:
/// chunk meta, position mapping, group-bit acquisition, used-size delta,
/// timestamp mapping and removal of the writing-chunk log entry.
///
/// Returns `Error::InvalidArg` when `chunk_id` is shorter than
/// `config.prefix_len`, and `Error::SerializationError` when encoding
/// fails.
pub fn add_chunk_mut(
    &self,
    chunk_id: &[u8],
    chunk_meta: &ChunkMeta,
    write_batch: &mut rocksdb::WriteBatch,
) -> Result<()> {
    // Validate first so that a rejected chunk id leaves `write_batch`
    // untouched instead of half-populated.
    self.check_chunk_id(chunk_id)?;
    // 1. add chunk meta.
    let chunk_meta_key = MetaKey::chunk_meta_key(chunk_id);
    Self::with_tls_bytes(|bytes| {
        chunk_meta
            .serialize_to(bytes)
            .map_err(Error::SerializationError)?;
        write_batch.put(chunk_meta_key, &bytes[..]);
        Ok(())
    })?;
    // 2. add pos->chunk map.
    let pos_to_chunk_key = MetaKey::pos_to_chunk_key(chunk_meta.pos);
    write_batch.put(pos_to_chunk_key, chunk_id);
    // 3. update group bits.
    let group_bits_key = MetaKey::group_bits_key(chunk_meta.pos.group_id());
    Self::with_tls_bytes(|bytes| {
        MergeState::acquire(chunk_meta.pos.index())
            .serialize_to(bytes)
            .map_err(Error::SerializationError)?;
        write_batch.merge(group_bits_key, &bytes[..]);
        Ok(())
    })?;
    // 4. update used size.
    self.update_used_size(chunk_id, chunk_meta.pos.chunk_size().0 as i64, write_batch)?;
    // 5. add timestamp->chunk map.
    let timestamp_key =
        MetaKey::timestamp_key(chunk_meta.timestamp, chunk_id, self.config.prefix_len);
    write_batch.put(timestamp_key, chunk_id);
    // 6. remove writing chunk log.
    self.remove_writing_chunk_mut(chunk_id, write_batch);
    Ok(())
}
/// Replaces the metadata of `chunk_id`: builds the batch with
/// `move_chunk_mut` and writes it, synchronously when `sync` is set.
pub fn move_chunk(
    &self,
    chunk_id: &[u8],
    old_meta: &ChunkMeta,
    new_meta: &ChunkMeta,
    sync: bool,
) -> Result<()> {
    let mut batch = RocksDB::new_write_batch();
    self.move_chunk_mut(chunk_id, old_meta, new_meta, &mut batch)
        .and_then(|()| self.write(batch, sync))
}
/// Appends to `write_batch` the operations that replace `old_meta` with
/// `new_meta` for `chunk_id`.
///
/// The chunk meta is always rewritten. Position mapping, group bits and
/// used size are only touched when the position changed. The timestamp
/// entry is re-keyed and the writing-chunk log entry is removed.
///
/// Returns `Error::InvalidArg` when `chunk_id` is shorter than
/// `config.prefix_len`, and `Error::SerializationError` when encoding
/// fails.
pub fn move_chunk_mut(
    &self,
    chunk_id: &[u8],
    old_meta: &ChunkMeta,
    new_meta: &ChunkMeta,
    write_batch: &mut rocksdb::WriteBatch,
) -> Result<()> {
    // Validate first so that a rejected chunk id leaves `write_batch`
    // untouched instead of half-populated.
    self.check_chunk_id(chunk_id)?;
    // 1. change chunk meta.
    let chunk_meta_key = MetaKey::chunk_meta_key(chunk_id);
    Self::with_tls_bytes(|bytes| {
        new_meta
            .serialize_to(bytes)
            .map_err(Error::SerializationError)?;
        write_batch.put(chunk_meta_key, bytes.as_slice());
        Ok(())
    })?;
    if old_meta.pos != new_meta.pos {
        // 2. remove old pos->chunk map.
        let old_pos = old_meta.pos;
        let pos_to_chunk_key = MetaKey::pos_to_chunk_key(old_pos);
        write_batch.delete(pos_to_chunk_key);
        let group_bits_key = MetaKey::group_bits_key(old_pos.group_id());
        Self::with_tls_bytes(|bytes| {
            MergeState::release(old_pos.index())
                .serialize_to(bytes)
                .map_err(Error::SerializationError)?;
            write_batch.merge(group_bits_key, &bytes[..]);
            Ok(())
        })?;
        // 3. add new pos->chunk map.
        let pos_to_chunk_key = MetaKey::pos_to_chunk_key(new_meta.pos);
        write_batch.put(pos_to_chunk_key, chunk_id);
        let group_bits_key = MetaKey::group_bits_key(new_meta.pos.group_id());
        Self::with_tls_bytes(|bytes| {
            MergeState::acquire(new_meta.pos.index())
                .serialize_to(bytes)
                .map_err(Error::SerializationError)?;
            write_batch.merge(group_bits_key, &bytes[..]);
            Ok(())
        })?;
        // 4. update used size.
        self.update_used_size(
            chunk_id,
            new_meta.pos.chunk_size().0 as i64 - old_pos.chunk_size().0 as i64,
            write_batch,
        )?;
    }
    // 5. update timestamp->chunk map.
    // Delete the old entry BEFORE putting the new one: when both metas
    // carry the same timestamp the two keys are identical, and a trailing
    // delete would erase the entry that was just written.
    let old_timestamp_key =
        MetaKey::timestamp_key(old_meta.timestamp, chunk_id, self.config.prefix_len);
    write_batch.delete(old_timestamp_key);
    let new_timestamp_key =
        MetaKey::timestamp_key(new_meta.timestamp, chunk_id, self.config.prefix_len);
    write_batch.put(new_timestamp_key, []);
    // 6. remove writing chunk log.
    self.remove_writing_chunk_mut(chunk_id, write_batch);
    Ok(())
}
/// Deletes a chunk: builds the batch with `remove_mut` and writes it,
/// synchronously when `sync` is set.
pub fn remove(&self, chunk_id: &[u8], chunk_meta: &ChunkMeta, sync: bool) -> Result<()> {
    let mut batch = RocksDB::new_write_batch();
    self.remove_mut(chunk_id, chunk_meta, &mut batch)
        .and_then(|()| self.write(batch, sync))
}
/// Appends to `write_batch` every operation needed to forget a chunk:
/// deletes chunk meta, position mapping and timestamp mapping, releases
/// the group bit, subtracts the chunk size from the used-size counter and
/// removes the writing-chunk log entry.
///
/// Returns `Error::InvalidArg` when `chunk_id` is shorter than
/// `config.prefix_len`, and `Error::SerializationError` when encoding
/// fails.
pub fn remove_mut(
    &self,
    chunk_id: &[u8],
    chunk_meta: &ChunkMeta,
    write_batch: &mut rocksdb::WriteBatch,
) -> Result<()> {
    // Validate first so that a rejected chunk id leaves `write_batch`
    // untouched instead of half-populated.
    self.check_chunk_id(chunk_id)?;
    // 1. delete chunk meta.
    let chunk_meta_key = MetaKey::chunk_meta_key(chunk_id);
    write_batch.delete(chunk_meta_key);
    // 2. delete pos->chunk map.
    let pos_to_chunk_key = MetaKey::pos_to_chunk_key(chunk_meta.pos);
    write_batch.delete(pos_to_chunk_key);
    // 3. release position.
    let group_bits_key = MetaKey::group_bits_key(chunk_meta.pos.group_id());
    Self::with_tls_bytes(|bytes| {
        MergeState::release(chunk_meta.pos.index())
            .serialize_to(bytes)
            .map_err(Error::SerializationError)?;
        write_batch.merge(group_bits_key, &bytes[..]);
        Ok(())
    })?;
    // 4. update used size.
    self.update_used_size(
        chunk_id,
        -(chunk_meta.pos.chunk_size().0 as i64),
        write_batch,
    )?;
    // 5. delete timestamp->chunk map.
    let timestamp_key =
        MetaKey::timestamp_key(chunk_meta.timestamp, chunk_id, self.config.prefix_len);
    write_batch.delete(timestamp_key);
    // 6. remove writing chunk log.
    self.remove_writing_chunk_mut(chunk_id, write_batch);
    Ok(())
}
/// Stores an empty `GroupState` under the group-bits key of `group_id`
/// with a synchronous write.
pub fn allocate_group(&self, group_id: GroupId) -> Result<()> {
    let empty_state = GroupState::empty();
    let key = MetaKey::group_bits_key(group_id);
    self.rocksdb.put(key, empty_state.as_bytes(), true)
}
/// Deletes the group-bits entry of `group_id` with a synchronous write.
pub fn remove_group(&self, group_id: GroupId) -> Result<()> {
    let key = MetaKey::group_bits_key(group_id);
    self.rocksdb.delete(key, true)
}
/// Creates a new iterator over the underlying database.
pub fn iterator(&self) -> RocksDBIterator {
    let db = &self.rocksdb;
    db.new_iterator()
}
/// Queues a merge that adds `diff` (little-endian `i64`) to the used-size
/// counter of the bucket formed by the first `config.prefix_len` bytes of
/// `chunk_id`. Fails with `Error::InvalidArg` if `chunk_id` is too short.
fn update_used_size(
    &self,
    chunk_id: &[u8],
    diff: i64,
    write_batch: &mut rocksdb::WriteBatch,
) -> Result<()> {
    self.check_chunk_id(chunk_id)?;
    let bucket = &chunk_id[..self.config.prefix_len];
    write_batch.merge(MetaKey::used_size_key(bucket), diff.to_le_bytes());
    Ok(())
}
/// Serializes `chunk_meta` and stores it under the writing-chunk key of
/// `chunk_id` with a synchronous write.
pub fn persist_writing_chunk(&self, chunk_id: &[u8], chunk_meta: &ChunkMeta) -> Result<()> {
    let key = MetaKey::writing_chunk_key(chunk_id);
    Self::with_tls_bytes(|buf| {
        chunk_meta
            .serialize_to(buf)
            .map_err(Error::SerializationError)?;
        self.rocksdb.put(key, &buf[..], true)
    })
}
/// Queues the deletion of the writing-chunk log entry of `chunk_id`.
pub fn remove_writing_chunk_mut(&self, chunk_id: &[u8], write_batch: &mut rocksdb::WriteBatch) {
    let key = MetaKey::writing_chunk_key(chunk_id);
    write_batch.delete(key);
}
/// Lists every writing-chunk log entry and makes sure its position is
/// marked as taken.
///
/// For each entry the returned tuple holds the chunk id, the logged meta
/// and a flag. The flag is `false` when the committed meta already points
/// at the logged position. It is `true` when this call had to add the
/// position mapping and acquire the group bit itself. The queued
/// operations are written synchronously when at least one entry exists.
pub fn occupy_uncommitted_positions(&mut self) -> Result<Vec<(Bytes, ChunkMeta, bool)>> {
    // `query_uncommitted_chunks` insists on a prefix of exactly
    // `prefix_len` bytes; temporarily use 0 so the empty prefix is accepted.
    // The configured value is restored BEFORE the error is propagated so a
    // failed query cannot leave the store with `prefix_len == 0`.
    let saved_prefix_len = std::mem::replace(&mut self.config.prefix_len, 0);
    let list = self.query_uncommitted_chunks(&[]);
    self.config.prefix_len = saved_prefix_len;
    let list = list?;

    let mut uncommitted_chunks = vec![];
    let mut write_batch = RocksDB::new_write_batch();
    let mut count = 0;
    for (chunk_id, writing_meta) in list {
        let pos = writing_meta.pos;
        match self.get_chunk_meta(&chunk_id)? {
            Some(meta) if meta.pos == writing_meta.pos => {
                uncommitted_chunks.push((chunk_id, writing_meta, false));
            }
            _ => {
                uncommitted_chunks.push((chunk_id.clone(), writing_meta, true));
                count += 1;
                let pos_to_chunk_key = MetaKey::pos_to_chunk_key(pos);
                write_batch.put(pos_to_chunk_key, chunk_id);
                let group_bits_key = MetaKey::group_bits_key(pos.group_id());
                Self::with_tls_bytes(|bytes| {
                    MergeState::acquire(pos.index())
                        .serialize_to(bytes)
                        .map_err(Error::SerializationError)?;
                    write_batch.merge(group_bits_key, &bytes[..]);
                    Ok(())
                })?;
            }
        }
    }
    if !uncommitted_chunks.is_empty() {
        self.write(write_batch, true)?;
        tracing::info!("occupy {} positions for writing chunks", count);
    }
    Ok(uncommitted_chunks)
}
/// Reverts the bookkeeping done for entries whose flag is `true`: deletes
/// their position mapping and releases their group bit. The batch is
/// always written synchronously, even when nothing was queued.
pub fn vacate_uncommitted_positions(
    &self,
    uncommitted_chunks: Vec<(Bytes, ChunkMeta, bool)>,
) -> Result<()> {
    let mut batch = RocksDB::new_write_batch();
    let mut vacated = 0;
    let occupied_only = uncommitted_chunks
        .into_iter()
        .filter(|(_, _, occupied)| *occupied);
    for (_, meta, _) in occupied_only {
        vacated += 1;
        let pos = meta.pos;
        batch.delete(MetaKey::pos_to_chunk_key(pos));
        let bits_key = MetaKey::group_bits_key(pos.group_id());
        Self::with_tls_bytes(|buf| {
            MergeState::release(pos.index())
                .serialize_to(buf)
                .map_err(Error::SerializationError)?;
            batch.merge(bits_key, &buf[..]);
            Ok(())
        })?;
    }
    self.write(batch, true)?;
    tracing::info!("vacate {} positions for writing chunks", vacated);
    Ok(())
}
fn query_uncommitted_chunks(&self, prefix: &[u8]) -> Result<Vec<(Bytes, ChunkMeta)>> {
self.check_prefix(prefix)?;
let mut it = self.iterator();
let mut out = Vec::<(Bytes, ChunkMeta)>::with_capacity(4096);
let end_key = MetaKey::writing_chunk_key(prefix);
it.seek(&end_key)?;
if it.key() == Some(end_key.as_ref()) {
it.next(); // [begin, end)
}
loop {
if !it.valid() {
break;
}
if it.key().unwrap()[0] != MetaKey::WRITING_CHUNK_KEY_PREFIX {
break;
}
let chunk_id = MetaKey::parse_writing_chunk_key(it.key().unwrap())?;
if prefix <= chunk_id.as_ref() {
let chunk_meta = ChunkMeta::deserialize(it.value().unwrap())
.map_err(Error::SerializationError)?;
out.push((chunk_id, chunk_meta))
} else {
break;
}
it.next();
}
Ok(out)
}
fn check_chunk_id(&self, chunk_id: &[u8]) -> Result<()> {
let prefix_len = self.config.prefix_len;
if chunk_id.len() < prefix_len {
return Err(Error::InvalidArg(format!(
"chunk_id.len() < prefix len: {:?}, {}",
chunk_id, prefix_len
)));
}
Ok(())
}
fn check_prefix(&self, prefix: &[u8]) -> Result<()> {
let prefix_len = self.config.prefix_len;
if prefix.len() != prefix_len {
return Err(Error::InvalidArg(format!(
"prefix.len() != prefix len: {:?}, {}",
prefix, prefix_len
)));
}
Ok(())
}
/// Reads the used-size counter of the bucket `prefix`.
///
/// `prefix` must be exactly `config.prefix_len` bytes long. A missing
/// counter reads as 0; a stored value that is not eight bytes long yields
/// `Error::InvalidArg`. The bytes are decoded as a little-endian `u64`.
pub fn query_used_size(&self, prefix: &[u8]) -> Result<u64> {
    self.check_prefix(prefix)?;
    let stored = match self.rocksdb.get(MetaKey::used_size_key(prefix))? {
        Some(stored) => stored,
        None => return Ok(0),
    };
    if stored.len() == std::mem::size_of::<u64>() {
        Ok(LittleEndian::read_u64(stored.as_ref()))
    } else {
        Err(Error::InvalidArg(format!(
            "invalid size length: {:?}",
            stored.as_ref()
        )))
    }
}
/// Runs `f` with mutable access to the thread-local scratch buffer and
/// returns its result. After `f` returns, the buffer is reset with
/// `clear_and_shrink_to(Size::MB)`.
///
/// The buffer is borrowed for the whole call, so `f` must not call
/// `with_tls_bytes` again on the same thread.
fn with_tls_bytes<F, R>(f: F) -> R
where
    F: FnOnce(&mut DownwardBytes) -> R,
{
    Self::BYTES.with(|cell| {
        let mut scratch = cell.borrow_mut();
        let outcome = f(&mut *scratch);
        scratch.clear_and_shrink_to(Size::MB.into());
        outcome
    })
}
/// Rebuilds the used-size counters when the configured prefix length
/// differs from the one recorded in the database.
///
/// The recorded length defaults to 0 when absent and must be stored as
/// four bytes, otherwise `Error::InvalidArg` is returned. On a mismatch
/// all chunk metas are scanned, chunk sizes are summed per id prefix
/// (ids are truncated or zero-padded to `prefix_len`), and the counters
/// and the new prefix length are written in one synchronous batch.
// NOTE(review): timestamp keys also embed `prefix_len` (see
// `MetaKey::timestamp_key`) but are not rewritten here - confirm whether
// they are migrated elsewhere.
fn update_used_size_if_need(&mut self) -> Result<()> {
    let old_len = match self.rocksdb.get(MetaKey::used_size_prefix_len_key())? {
        Some(value) => {
            if value.len() != std::mem::size_of::<u32>() {
                return Err(Error::InvalidArg(format!(
                    "invalid used size prefix length: {:?}",
                    value.as_ref()
                )));
            }
            LittleEndian::read_u32(value.as_ref()) as usize
        }
        None => 0,
    };
    let prefix_len = self.config.prefix_len;
    if old_len == prefix_len {
        return Ok(());
    }

    let mut map = HashMap::<Bytes, u64>::new();
    if prefix_len == 0 {
        // The single global bucket must exist even without chunks.
        map.insert(Bytes::new(), 0);
    }
    let mut it = self.iterator();
    it.iterate(MetaKey::chunk_meta_key_prefix(), |key, value| {
        let mut chunk_id = MetaKey::parse_chunk_meta_key(key);
        chunk_id.resize(prefix_len, 0);
        let chunk_meta = ChunkMeta::deserialize(value).map_err(Error::SerializationError)?;
        let chunk_size = chunk_meta.pos.chunk_size().0;
        *map.entry(chunk_id).or_insert(0) += chunk_size;
        Ok(())
    })?;

    let mut write_batch = RocksDB::new_write_batch();
    // Drop every counter written under the previous prefix length. Without
    // this, a bucket that no longer has chunks would keep its stale value.
    // The range end is exclusive, so the prefix-length key is not affected.
    write_batch.delete_range(
        &[MetaKey::USED_SIZE_KEY_PREFIX],
        &[MetaKey::USED_SIZE_KEY_PREFIX + 1],
    );
    write_batch.put(
        MetaKey::used_size_prefix_len_key(),
        (prefix_len as u32).to_le_bytes(),
    );
    for (prefix, size) in map {
        write_batch.put(MetaKey::used_size_key(&prefix), size.to_le_bytes())
    }
    self.write(write_batch, true)
}
/// Version number 1. NOTE(review): the name suggests a timestamp fix; the
/// code that applies it is not part of this impl - confirm.
pub const V1_FIX_TIMESTAMP: u8 = 1;
/// Newest version number known to this code.
pub const LATEST_VERSION: u8 = Self::V1_FIX_TIMESTAMP;
pub fn get_version(&self) -> Result<u8> {
match self.rocksdb.get(MetaKey::version_key())? {
Some(value) if !value.is_empty() => Ok(value[0]),
Some(value) => Err(Error::InvalidArg(format!(
"invalid version: {:?}",
value.as_ref()
))),
None => Ok(0),
}
}
/// Stores `version` as a single byte under the version key with a
/// synchronous write.
pub fn set_version(&self, version: u8) -> Result<()> {
    let key = MetaKey::version_key();
    self.rocksdb.put(key, [version], true)
}
/// Queues the deletion of every key whose first byte is `prefix`.
///
/// The chunk-meta, group-bits, position, used-size and used-size
/// prefix-length tags are protected and rejected with
/// `Error::InvalidArg`. `prefix == 255` is rejected as well because the
/// exclusive upper bound `prefix + 1` cannot be represented.
pub fn remove_range_mut(
    &self,
    prefix: u8,
    write_batch: &mut rocksdb::WriteBatch,
) -> Result<()> {
    let protected = matches!(
        prefix,
        MetaKey::CHUNK_META_KEY_PREFIX
            | MetaKey::GROUP_BITS_KEY_PREFIX
            | MetaKey::POS_TO_CHUNK_KEY_PREFIX
            | MetaKey::USED_SIZE_KEY_PREFIX
            | MetaKey::USED_SIZE_PREFIX_LEN_KEY
    );
    // `checked_add` instead of `prefix + 1`: the plain addition overflows
    // for 255 (panic in debug builds, inverted range in release builds).
    let end = match prefix.checked_add(1) {
        Some(end) if !protected => end,
        _ => {
            return Err(Error::InvalidArg(format!(
                "invalid remove range: {}",
                prefix
            )));
        }
    };
    write_batch.delete_range(&[prefix], &[end]);
    Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
// Basic add / get / query / remove round trip, plus rejection of a
// protected range and of an empty version value.
#[test]
fn test_meta_store_normal() {
let dir = tempfile::tempdir().unwrap();
let config = MetaStoreConfig {
rocksdb: RocksDBConfig {
path: dir.path().into(),
create: true,
..Default::default()
},
..Default::default()
};
let meta_store = MetaStore::open(&config).unwrap();
let chunk_id = "1000".as_bytes();
let chunk_meta = meta_store.get_chunk_meta(chunk_id).unwrap();
assert!(chunk_meta.is_none());
let chunk_meta_in = ChunkMeta {
chunk_ver: 1,
..Default::default()
};
meta_store
.add_chunk(chunk_id, &chunk_meta_in, false)
.unwrap();
let chunk_id = "1000".as_bytes();
let chunk_meta_out = meta_store.get_chunk_meta(chunk_id).unwrap().unwrap();
assert_eq!(chunk_meta_in, chunk_meta_out);
assert_eq!(meta_store.query_chunks([], "100", 10).unwrap().len(), 1);
meta_store.remove(chunk_id, &chunk_meta_out, false).unwrap();
let mut write_batch = RocksDB::new_write_batch();
// The chunk-meta tag is protected from range removal.
meta_store
.remove_range_mut(MetaKey::CHUNK_META_KEY_PREFIX, &mut write_batch)
.unwrap_err();
// An empty version value is reported as an error.
meta_store
.rocksdb
.put(MetaKey::version_key(), &[], false)
.unwrap();
meta_store.get_version().unwrap_err();
}
// Range queries and group-bit bookkeeping over 128 chunks in one group.
#[test]
fn test_meta_get_set() {
let dir = tempfile::tempdir().unwrap();
let config = MetaStoreConfig {
rocksdb: RocksDBConfig {
path: dir.path().into(),
create: true,
..Default::default()
},
..Default::default()
};
let meta_store = MetaStore::open(&config).unwrap();
let group_id = GroupId::default();
let mut chunk_meta = ChunkMeta::default();
// Occupy every even index of the group.
for i in 0..128u32 {
chunk_meta.pos = Position::new(group_id, 2 * i as u8);
meta_store
.add_chunk(&i.to_be_bytes(), &chunk_meta, false)
.unwrap();
}
// [10, 20) yields ten chunks, highest id first.
let vec = meta_store
.query_chunks(10u32.to_be_bytes(), 20u32.to_be_bytes(), 30)
.unwrap();
assert_eq!(vec.len(), 10);
assert_eq!(vec.first().unwrap().0.as_ref(), &19u32.to_be_bytes());
assert_eq!(vec.last().unwrap().0.as_ref(), &10u32.to_be_bytes());
let vec = meta_store
.query_chunks(80u32.to_be_bytes(), 100u32.to_be_bytes(), 30)
.unwrap();
assert_eq!(vec.len(), 20);
let mut it = meta_store.iterator();
let mut count = 0;
it.iterate(MetaKey::group_bits_key_prefix(), |_key, value| {
count += 1;
let bits = GroupState::from(value)?;
assert_eq!(bits.count(), 128);
for i in 0..128 {
assert!(bits.check(i * 2));
assert!(!bits.check(i * 2 + 1));
}
Ok(())
})
.unwrap();
assert_eq!(count, 1);
// Adding the same ids again at the odd indexes fills the group.
for i in 0..128u32 {
chunk_meta.pos = Position::new(group_id, 1 + 2 * i as u8);
meta_store
.add_chunk(&i.to_be_bytes(), &chunk_meta, false)
.unwrap();
}
let mut it = meta_store.iterator();
let mut count = 0;
it.iterate(MetaKey::group_bits_key_prefix(), |_key, value| {
count += 1;
let bits = GroupState::from(value)?;
assert_eq!(bits.count(), 256);
assert!(bits.is_full());
Ok(())
})
.unwrap();
assert_eq!(count, 1);
}
// Opening under `/proc` must fail.
#[test]
fn test_meta_store_open_failed() {
let config = MetaStoreConfig {
rocksdb: RocksDBConfig {
path: "/proc/test".into(),
create: true,
..Default::default()
},
..Default::default()
};
assert!(MetaStore::open(&config).is_err());
}
// Used-size accounting with a prefix length of 4, including validation
// of chunk-id / prefix lengths and of malformed stored values.
#[test]
fn test_meta_store_update_used_size() {
let dir = tempfile::tempdir().unwrap();
let config = MetaStoreConfig {
rocksdb: RocksDBConfig {
path: dir.path().into(),
create: true,
..Default::default()
},
prefix_len: 4,
};
let meta_store = MetaStore::open(&config).unwrap();
let chunk_id = [0, 1, 2, 3];
let group_id = GroupId::default();
let chunk_meta = ChunkMeta {
pos: Position::new(group_id, 0_u8),
..Default::default()
};
// A chunk id shorter than the prefix length is rejected.
meta_store
.add_chunk(&chunk_id[..3], &chunk_meta, false)
.unwrap_err();
meta_store.add_chunk(&chunk_id, &chunk_meta, false).unwrap();
// A prefix of the wrong length is rejected.
meta_store.query_used_size(&chunk_id[..3]).unwrap_err();
assert_eq!(
meta_store.query_used_size(&chunk_id).unwrap(),
CHUNK_SIZE_NORMAL
);
assert_eq!(meta_store.query_used_size(&0u32.to_le_bytes()).unwrap(), 0);
meta_store
.query_chunks_by_timestamp(&0u32.to_le_bytes(), 0, u64::MAX, u64::MAX)
.unwrap();
meta_store.remove(&chunk_id, &chunk_meta, false).unwrap();
assert_eq!(meta_store.query_used_size(&chunk_id).unwrap(), 0);
// An empty stored counter is reported as an error.
let key = MetaKey::used_size_key(&chunk_id);
meta_store.rocksdb.put(key, [], false).unwrap();
meta_store.query_used_size(&chunk_id).unwrap_err();
// A prefix-length record that is not four bytes makes `open` fail.
meta_store
.rocksdb
.put(MetaKey::used_size_prefix_len_key(), [233], false)
.unwrap();
drop(meta_store);
assert!(MetaStore::open(&config).is_err());
}
// Reopening with a different prefix length rebuilds the counters.
#[test]
fn test_meta_store_update_used_size_prefix_len() {
let dir = tempfile::tempdir().unwrap();
let mut config = MetaStoreConfig {
rocksdb: RocksDBConfig {
path: dir.path().into(),
create: true,
..Default::default()
},
prefix_len: 0,
};
const N: u64 = 1024;
let start = ChunkMeta::now();
let meta_store = MetaStore::open(&config).unwrap();
// Alternate between normal and small chunk sizes.
for i in 0..N {
let chunk_id = i.to_le_bytes();
let id = i as u8;
let chunk_size = if id % 2 == 0 {
CHUNK_SIZE_NORMAL
} else {
CHUNK_SIZE_SMALL
};
let pos = Position::new(GroupId::new(chunk_size, 0, 0), id);
let meta = ChunkMeta {
pos,
..Default::default()
};
meta_store.add_chunk(&chunk_id, &meta, false).unwrap();
}
let size = meta_store.query_used_size(&[]).unwrap();
assert_eq!(size, N / 2 * (CHUNK_SIZE_NORMAL.0 + CHUNK_SIZE_SMALL.0));
// Add a key under the test tag `m`.
let mut write_batch = RocksDB::new_write_batch();
write_batch.put("m", "m");
meta_store.write(write_batch, false).unwrap();
let end = ChunkMeta::now();
let vec = meta_store
.query_chunks_by_timestamp(&[], 0, start, u64::MAX)
.unwrap();
assert!(vec.is_empty());
let vec = meta_store
.query_chunks_by_timestamp(&[], start, end + 1, u64::MAX)
.unwrap();
assert_eq!(vec.len(), N as usize);
drop(meta_store);
// Prefix length 1: one counter per first id byte.
config.prefix_len = 1;
let meta_store = MetaStore::open(&config).unwrap();
for i in 0..=u8::MAX {
let size = meta_store.query_used_size(&[i]).unwrap();
if i % 2 == 0 {
assert_eq!(size, N / 256 * CHUNK_SIZE_NORMAL.0);
} else {
assert_eq!(size, N / 256 * CHUNK_SIZE_SMALL.0);
}
}
for i in 0..N {
let chunk_id = i.to_le_bytes();
let meta = meta_store.get_chunk_meta(&chunk_id).unwrap().unwrap();
meta_store.remove(&chunk_id, &meta, false).unwrap();
}
for i in 0..=u8::MAX {
let size = meta_store.query_used_size(&[i]).unwrap();
assert_eq!(size, 0);
}
drop(meta_store);
// Back to prefix length 0 with no chunks left: the total is 0.
config.prefix_len = 0;
let meta_store = MetaStore::open(&config).unwrap();
let size = meta_store.query_used_size(&[]).unwrap();
assert_eq!(size, 0);
}
}

View File

@@ -0,0 +1,9 @@
mod meta_key;
mod meta_merge;
mod meta_store;
mod rocksdb;
pub use meta_key::*;
pub use meta_merge::*;
pub use meta_store::*;
pub use rocksdb::*;

View File

@@ -0,0 +1,314 @@
use crate::{Error, Result, Size};
use std::path::PathBuf;
/// Options consumed by `RocksDB::open`.
#[derive(Debug, Default, Clone)]
pub struct RocksDBConfig {
/// Location of the database, passed to the rocksdb open call.
pub path: PathBuf,
/// Forwarded to `create_if_missing`.
pub create: bool,
/// When set, the database is opened with `open_for_read_only`.
pub read_only: bool,
}
/// Wrapper around a `rocksdb::DB` that maps errors to `Error::RocksDBError`
/// and keeps one pre-built `WriteOptions` per sync mode.
pub struct RocksDB {
db: rocksdb::DB,
write_options: [rocksdb::WriteOptions; 2], // 0 for non-sync, 1 for sync.
}
/// Static merge callbacks that `RocksDB::open` installs as the database's
/// merge operator.
pub trait MergeOp {
// Installed as the full-merge callback: receives the key, the existing
// value (if any) and the operands; `None` signals failure.
fn full_merge<'a>(
key: &[u8],
value: Option<&[u8]>,
operands: impl Iterator<Item = &'a [u8]>,
) -> Option<Vec<u8>>;
// Installed as the partial-merge callback: receives the key and the
// operands only (the existing value is not passed on); `None` signals
// failure.
fn partial_merge<'a>(key: &[u8], operands: impl Iterator<Item = &'a [u8]>) -> Option<Vec<u8>>;
}
impl RocksDB {
pub fn open<T: MergeOp + 'static>(config: &RocksDBConfig) -> Result<Self> {
let mut db_options = rocksdb::Options::default();
db_options.create_if_missing(config.create);
db_options.set_merge_operator(
"merge",
|key, value, operands| T::full_merge(key, value, operands.iter()),
|key, _value, operands| T::partial_merge(key, operands.iter()),
);
let mut table_options = rocksdb::BlockBasedOptions::default();
table_options.set_bloom_filter(10.0, true);
db_options.set_block_based_table_factory(&table_options);
let db = if config.read_only {
rocksdb::DB::open_for_read_only(&db_options, &config.path, false)
} else {
rocksdb::DB::open(&db_options, &config.path)
}
.map_err(|err| Error::RocksDBError(format!("open rocksdb fail: {:?}", err)))?;
let mut sync_write_options = rocksdb::WriteOptions::new();
sync_write_options.set_sync(true);
Ok(Self {
db,
write_options: [rocksdb::WriteOptions::new(), sync_write_options],
})
}
pub fn get(&self, key: impl AsRef<[u8]>) -> Result<Option<rocksdb::DBPinnableSlice>> {
match self.db.get_pinned(key) {
Ok(v) => Ok(v),
Err(e) => Err(Error::RocksDBError(format!("RocksDB fail: {e:?}"))),
}
}
pub fn put(&self, key: impl AsRef<[u8]>, value: impl AsRef<[u8]>, sync: bool) -> Result<()> {
match self
.db
.put_opt(key, value, &self.write_options[sync as usize])
{
Ok(v) => Ok(v),
Err(e) => Err(Error::RocksDBError(format!("RocksDB fail: {e:?}"))),
}
}
pub fn delete(&self, key: impl AsRef<[u8]>, sync: bool) -> Result<()> {
match self.db.delete_opt(key, &self.write_options[sync as usize]) {
Ok(v) => Ok(v),
Err(e) => Err(Error::RocksDBError(format!("RocksDB fail: {e:?}"))),
}
}
pub fn new_write_batch() -> rocksdb::WriteBatch {
rocksdb::WriteBatch::default()
}
pub fn write(&self, batch: rocksdb::WriteBatch, sync: bool) -> Result<()> {
match self.db.write_opt(batch, &self.write_options[sync as usize]) {
Ok(v) => Ok(v),
Err(e) => Err(Error::RocksDBError(format!("RocksDB fail: {e:?}"))),
}
}
pub fn new_iterator(&self) -> RocksDBIterator {
let mut read_options = rocksdb::ReadOptions::default();
read_options.set_readahead_size(Size::mebibyte(4).into());
RocksDBIterator(self.db.raw_iterator_opt(read_options))
}
}
impl Drop for RocksDB {
// Only emits a log line; the `db` field is released by its own drop afterwards.
fn drop(&mut self) {
tracing::info!("RocksDB {:?} is closing...", self.db);
}
}
pub struct RocksDBIterator<'a>(rocksdb::DBRawIterator<'a>);
impl RocksDBIterator<'_> {
pub fn iterate<P, Fn>(&mut self, prefix: P, mut func: Fn) -> Result<u32>
where
P: AsRef<[u8]>,
Fn: FnMut(&[u8], &[u8]) -> Result<()>,
{
let it = &mut self.0;
it.seek(prefix.as_ref());
let mut count = 0;
while it.valid() && it.key().unwrap().starts_with(prefix.as_ref()) {
func(it.key().unwrap(), it.value().unwrap_or(&[]))?;
it.next();
count += 1;
}
self.status()?;
Ok(count)
}
pub fn seek<P>(&mut self, prefix: P) -> Result<()>
where
P: AsRef<[u8]>,
{
self.0.seek(prefix.as_ref());
self.status()
}
pub fn valid(&self) -> bool {
self.0.valid()
}
pub fn status(&self) -> Result<()> {
self.0
.status()
.map_err(|e| Error::RocksDBError(e.to_string()))
}
pub fn next(&mut self) {
self.0.next();
}
pub fn key(&self) -> Option<&[u8]> {
self.0.key()
}
pub fn value(&self) -> Option<&[u8]> {
self.0.value()
}
}
// Tests for the RocksDB wrapper; each test opens a database in a fresh temp directory.
#[cfg(test)]
mod tests {
// Covers get/put, batched puts, batched merges, prefix iteration and a failing open.
#[test]
fn test_rocksdb_create_get_set() {
use super::super::*;
let dir = tempfile::tempdir().unwrap();
let config = RocksDBConfig {
path: dir.path().into(),
create: true,
..Default::default()
};
let rocksdb = RocksDB::open::<MetaMergeOp>(&config).unwrap();
// A key that was never written reads back as None.
let value = rocksdb.get("merry".as_bytes()).unwrap();
assert!(value.is_none());
rocksdb
.put("merry".as_bytes(), "world".as_bytes(), false)
.unwrap();
let value = rocksdb.get("merry".as_bytes()).unwrap();
assert_eq!(value.as_deref(), Some("world".as_bytes()));
// A batch overwrites "merry" and adds "peace".
let mut batch = RocksDB::new_write_batch();
batch.put("merry", "RocksDB");
batch.put("peace", "love");
rocksdb.write(batch, false).unwrap();
let value = rocksdb.get("merry".as_bytes()).unwrap();
assert_eq!(value.as_deref(), Some("RocksDB".as_bytes()));
let value = rocksdb.get("peace".as_bytes()).unwrap();
assert_eq!(value.as_deref(), Some("love".as_bytes()));
// Merge operands; the assertions below expect them appended in order to the stored value.
let mut batch = RocksDB::new_write_batch();
batch.merge("merry", "1");
batch.merge("merry", "2");
for i in 0..16 {
batch.merge("merge", format!("{i}"));
}
rocksdb.write(batch, false).unwrap();
let value = rocksdb.get("merry".as_bytes()).unwrap();
assert_eq!(value.as_deref(), Some("RocksDB12".as_bytes()));
let value = rocksdb.get("merge".as_bytes()).unwrap();
assert_eq!(value.as_deref(), Some("0123456789101112131415".as_bytes()));
// Prefix scans: three keys in total ("merge", "merry", "peace"), two of them start with "m".
let mut it = rocksdb.new_iterator();
let mut count = 0;
let mut runner = |_: &[u8], _: &[u8]| {
count += 1;
crate::Result::Ok(())
};
assert_eq!(it.iterate([], &mut runner).unwrap(), 3);
assert_eq!(it.iterate("m", &mut runner).unwrap(), 2);
assert_eq!(it.iterate("a", &mut runner).unwrap(), 0);
assert_eq!(it.iterate("z", &mut runner).unwrap(), 0);
// Opening under /proc is expected to fail.
let config = RocksDBConfig {
path: std::path::Path::new("/proc/test").into(),
create: true,
..Default::default()
};
assert!(RocksDB::open::<MetaMergeOp>(&config).is_err());
}
// Sixteen threads each write 1000 batches through a shared handle; every write must succeed.
#[test]
fn test_rocksdb_parallel_write() {
use super::super::*;
use std::sync::Arc;
let dir = tempfile::tempdir().unwrap();
let config = RocksDBConfig {
path: dir.path().into(),
create: true,
..Default::default()
};
let rocksdb = Arc::new(RocksDB::open::<MetaMergeOp>(&config).unwrap());
const T: usize = 16;
const N: usize = 1000;
let mut threads = vec![];
for i in 0..T {
let rocksdb = rocksdb.clone();
threads.push(
std::thread::Builder::new()
.name(format!("test-{i}"))
.spawn(move || {
for j in 0..N {
let value = [j as u8; 32];
let mut batch = RocksDB::new_write_batch();
// `i * N + j` makes the key suffix unique per thread and iteration.
batch.put(format!("a{}atesta", i * N + j), value);
batch.put(format!("b{}btestb", i * N + j), value);
batch.merge(format!("m{}mtestm", i * N + j), value);
rocksdb.write(batch, false).unwrap();
}
})
.unwrap(),
)
}
for thread in threads {
thread.join().unwrap();
}
}
// An empty merge operand is expected to make reads of that key fail until it is overwritten.
#[test]
fn test_rocksdb_invalid_merge() {
use super::super::*;
let dir = tempfile::tempdir().unwrap();
let config = RocksDBConfig {
path: dir.path().into(),
create: true,
..Default::default()
};
let rocksdb = RocksDB::open::<MetaMergeOp>(&config).unwrap();
let mut batch = RocksDB::new_write_batch();
batch.merge("invalid_merge", "");
rocksdb.write(batch, false).unwrap();
// The write itself succeeds; the failure surfaces on get, iterate and seek.
assert!(rocksdb.get("invalid_merge").is_err());
let mut runner = |_: &[u8], _: &[u8]| crate::Result::Ok(());
let mut it = rocksdb.new_iterator();
assert!(it.iterate("invalid_merge", &mut runner).is_err());
assert!(it.seek("invalid_merge").is_err());
// A plain put replaces the broken entry and reads work again.
assert!(rocksdb.put("invalid_merge", "ok", false).is_ok());
assert!(rocksdb.get("invalid_merge").is_ok());
drop(it);
// A fresh iterator sees the repaired entry.
let mut it = rocksdb.new_iterator();
assert_eq!(it.iterate("invalid_merge", &mut runner), Ok(1));
it.seek("invalid_merge").unwrap();
assert!(it.valid());
assert_eq!(it.key().unwrap(), "invalid_merge".as_bytes());
assert_eq!(it.value().unwrap(), "ok".as_bytes());
it.next();
assert!(!it.valid());
assert!(it.status().is_ok());
drop(it);
drop(rocksdb);
// Reopen the same directory in read-only mode.
let config = RocksDBConfig {
path: dir.path().into(),
create: false,
read_only: true,
};
RocksDB::open::<MetaMergeOp>(&config).unwrap();
}
}

View File

@@ -0,0 +1,84 @@
use super::super::*;
pub type ETag = tinyvec::TinyVec<[u8; 14]>;
/// Per-chunk metadata record; serialized and deserialized with `derse`.
#[derive(derse::Serialize, derse::Deserialize, Clone, PartialEq, Eq, Debug)]
#[repr(C)]
pub struct ChunkMeta {
/// Packed position value (see `Position`).
pub pos: Position,
// NOTE(review): presumably the replication-chain version — confirm against callers.
pub chain_ver: u32,
// NOTE(review): presumably the per-chunk version counter — confirm against callers.
pub chunk_ver: u32,
// NOTE(review): presumably the chunk data length in bytes — confirm against callers.
pub len: u32,
/// Checksum value; also the source of the default etag (upper-case hex).
pub checksum: u32,
/// `Default` fills this with `ChunkMeta::now()`, i.e. microseconds since the Unix epoch.
pub timestamp: u64,
// NOTE(review): the three `last_*` fields look like the identity of the last request/client — confirm.
pub last_request_id: u64,
pub last_client_low: u64,
pub last_client_high: u64,
/// Entity tag; `set_default_etag_if_need` fills it from `checksum` when empty.
pub etag: ETag,
pub uncommitted: bool,
}
impl ChunkMeta {
    /// Current wall-clock time as microseconds since the Unix epoch.
    ///
    /// Panics if the system clock reports a time before the epoch.
    pub fn now() -> u64 {
        let epoch = std::time::UNIX_EPOCH;
        let elapsed = std::time::SystemTime::now()
            .duration_since(epoch)
            .unwrap();
        elapsed.as_micros() as u64
    }

    /// Fills an empty etag with the upper-case hexadecimal form of `checksum`;
    /// an etag that is already set is left untouched.
    pub fn set_default_etag_if_need(&mut self) {
        if !self.etag.is_empty() {
            return;
        }
        let hex = format!("{:X}", self.checksum);
        self.etag = ETag::from(hex.as_bytes());
    }
}
impl Default for ChunkMeta {
/// All counters zero, empty etag, committed, timestamp set to the current time.
fn default() -> Self {
Self {
// Position built from a 1 GiB group id (cluster 0, group 0) at index 0.
// NOTE(review): `Position::default()` uses CHUNK_SIZE_NORMAL instead — confirm the difference is intended.
pos: Position::new(GroupId::new(Size::GB, 0, 0), 0),
chain_ver: 0,
chunk_ver: 0,
len: 0,
checksum: 0,
// Not a constant: two defaults created at different times differ here.
timestamp: Self::now(),
last_request_id: 0,
last_client_low: 0,
last_client_high: 0,
etag: Default::default(),
uncommitted: false,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use derse::{Deserialize, Serialize};
// Pins the serialized byte image of a ChunkMeta and checks the round trip.
#[test]
fn test_chunk_meta_seralization() {
let ser = ChunkMeta {
pos: Position::new(GroupId::default(), 88),
chain_ver: 1,
chunk_ver: 1,
len: 2,
// Fixed timestamp so the expected bytes below are deterministic.
timestamp: 0,
etag: ETag::from(b"hello".as_slice()),
..Default::default()
};
let bytes: derse::DownwardBytes = ser.serialize().unwrap();
// Any change to this array means the serialized format changed.
assert_eq!(
bytes.as_slice(),
&[
63, 88, 0, 0, 0, 0, 0, 8, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 5, b'h', b'e', b'l', b'l', b'o', 0,
]
);
let der = ChunkMeta::deserialize(&bytes[..]).unwrap();
assert_eq!(ser, der);
}
}

View File

@@ -0,0 +1,8 @@
use super::super::Size;
// Named chunk sizes: 64 KiB, 512 KiB, 4 MiB and 64 MiB.
pub const CHUNK_SIZE_SMALL: Size = Size::kibibyte(64);
pub const CHUNK_SIZE_NORMAL: Size = Size::kibibyte(512);
pub const CHUNK_SIZE_LARGE: Size = Size::mebibyte(4);
pub const CHUNK_SIZE_ULTRA: Size = Size::mebibyte(64);
// log2 of the smallest chunk size.
pub const CHUNK_SIZE_SHIFT: usize = 16; // 64KiB is 2^16
// Number of power-of-two sizes from 2^16 to 2^26 inclusive.
pub const CHUNK_SIZE_NUMBER: usize = 11; // from 64KiB to 64MiB

View File

@@ -0,0 +1,114 @@
use super::super::*;
/// Group identifier packed into a single `u64` (chunk size, group number, cluster).
#[derive(Copy, Clone, Eq, PartialEq, Hash, PartialOrd, Ord)]
pub struct GroupId(pub u64);
impl Default for GroupId {
/// Group 0 of cluster 0 with the normal chunk size.
fn default() -> Self {
GroupId::new(CHUNK_SIZE_NORMAL, 0, 0)
}
}
impl GroupId {
    // Bit layout, high to low: 32-bit chunk size | 24-bit group | 8-bit cluster.
    const SHIFT: u32 = 8;
    /// `1 << 8`; `size()` multiplies the chunk size by this value.
    pub const COUNT: u32 = 1 << Self::SHIFT;

    /// Packs the three components into one id. `group` bits above the low
    /// 24 are shifted out of the 32-bit intermediate.
    pub const fn new(chunk_size: Size, cluster: u8, group: u32) -> Self {
        let low_half = (group << Self::SHIFT | cluster as u32) as u64;
        Self(chunk_size.0 << 32 | low_half)
    }

    /// Chunk size stored in the upper 32 bits.
    pub const fn chunk_size(&self) -> Size {
        let upper = self.0 >> 32;
        Size::new(upper)
    }

    /// Cluster stored in the lowest byte.
    pub const fn cluster(&self) -> u8 {
        self.0 as u8
    }

    /// Group number stored in bits 8..32.
    pub const fn group(&self) -> u32 {
        let lower = self.0 as u32;
        lower >> Self::SHIFT
    }

    /// Chunk size multiplied by `group * 256`.
    pub fn offset(&self) -> Size {
        // The negation is applied to the u32 before widening, so the mask is
        // 0x0000_0000_FFFF_FF00: it keeps only the shifted group bits.
        const MARKS: u64 = (!(GroupId::COUNT - 1)) as u64;
        let shifted_group = self.0 & MARKS;
        self.chunk_size() * shifted_group
    }

    /// Chunk size multiplied by `COUNT`.
    pub fn size(&self) -> Size {
        let slots = GroupId::COUNT as u64;
        self.chunk_size() * slots
    }

    /// Returns the id whose raw value is one larger. Because the cluster
    /// occupies the low byte, this steps the cluster and carries into the group.
    pub fn plus_one(&self) -> Self {
        Self(self.0 + 1)
    }

    /// In-place version of `plus_one`.
    pub fn next(&mut self) {
        *self = self.plus_one();
    }

    /// Raw packed value.
    pub const fn inner(&self) -> u64 {
        self.0
    }
}
// Wraps a raw packed value without any validation.
impl From<u64> for GroupId {
fn from(value: u64) -> Self {
Self(value)
}
}
// Unwraps the raw packed value.
impl From<GroupId> for u64 {
fn from(val: GroupId) -> Self {
val.0
}
}
// Lets a GroupId be used where a `&u64` is expected (exposes `u64` methods directly).
impl std::ops::Deref for GroupId {
type Target = u64;
fn deref(&self) -> &Self::Target {
&self.0
}
}
// Prints the decoded components instead of the raw packed integer.
impl std::fmt::Debug for GroupId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"GroupId {{ chunk_size: {}, cluster: {}, group: {} }}",
// `{}` uses the Display form of Size.
self.chunk_size(),
self.cluster(),
self.group(),
)
}
}
#[cfg(test)]
mod tests {
use super::*;
// Walks `plus_one` through 1000 * 256 steps and checks cluster/group roll-over.
#[test]
fn test_group_id_next() {
let mut group_id = GroupId::default();
for _ in 0..1000 {
for i in 0..=255 {
let next = group_id.plus_one();
if i == 255 {
// Cluster wraps from 255 to 0 and the carry increments the group.
assert_eq!(group_id.chunk_size(), next.chunk_size());
assert_eq!(0, next.cluster());
assert_eq!(group_id.group() + 1, next.group());
} else {
// Only the cluster advances.
assert_eq!(group_id.chunk_size(), next.chunk_size());
assert_eq!(group_id.cluster() + 1, next.cluster());
assert_eq!(group_id.group(), next.group());
}
group_id = next;
}
}
let value = u64::from(group_id);
assert_eq!(value, group_id.0);
}
}

View File

@@ -0,0 +1,163 @@
use super::super::*;
use std::num::NonZeroU64;
type Item = u64;
type Bits = [Item; 4];
/// Allocation bitmap of four `u64` words (256 bits) plus a cached population count.
#[derive(Debug, PartialEq, Copy, Clone)]
pub struct GroupState {
// One bit per slot; a set bit means the slot is allocated.
bits: Bits,
// Number of set bits in `bits`; recomputed by `from` and `update`, adjusted by allocate/deallocate.
count: u32,
}
impl GroupState {
    const TOTAL_BYTES: usize = std::mem::size_of::<Bits>();
    /// Number of slots tracked by one state.
    pub const TOTAL_BITS: usize = 8 * Self::TOTAL_BYTES;
    /// Number of bits in one bitmap word.
    pub const ITEM_BITS: u8 = 8 * std::mem::size_of::<Item>() as u8;
    /// Number of words in the bitmap.
    pub const LEN: usize = Self::TOTAL_BYTES / std::mem::size_of::<Item>();
    /// Divisor used by `level()` to bucket the allocation count.
    pub const LEVELS: usize = 4;

    /// Rebuilds a state from its raw byte image.
    ///
    /// Fails with `Error::MetaError` unless `value` is exactly
    /// `TOTAL_BYTES` long. The count is recomputed from the bits.
    pub fn from(value: &[u8]) -> Result<Self> {
        if value.len() != Self::TOTAL_BYTES {
            return Err(Error::MetaError(format!(
                "group state load bytes {} != {}",
                value.len(),
                Self::TOTAL_BYTES
            )));
        }
        let mut state = Self::empty();
        state.as_mut_bytes().copy_from_slice(value);
        state.count = Self::count_set_bits(&state.bits);
        Ok(state)
    }

    /// State with no slot allocated.
    pub const fn empty() -> Self {
        Self {
            bits: [0; Self::LEN],
            count: 0,
        }
    }

    /// State with every slot allocated.
    pub fn full() -> Self {
        Self {
            bits: [!0; Self::LEN],
            count: Self::TOTAL_BITS as u32,
        }
    }

    /// True when no slot is allocated.
    pub fn is_empty(&self) -> bool {
        self.count == 0
    }

    /// True when every slot is allocated.
    pub fn is_full(&self) -> bool {
        self.count == Self::TOTAL_BITS as u32
    }

    /// Marks the lowest free slot as allocated and returns its index, or
    /// `None` when every slot is taken.
    pub fn allocate(&mut self) -> Option<u8> {
        for (word_idx, word) in self.bits.iter_mut().enumerate() {
            // The inverted word is non-zero exactly when the word has a free bit.
            let free = match NonZeroU64::new(!*word) {
                Some(free) => free,
                None => continue,
            };
            let bit = free.trailing_zeros();
            *word |= 1 << bit;
            self.count += 1;
            return Some(word_idx as u8 * Self::ITEM_BITS + bit as u8);
        }
        None
    }

    /// Number of allocated slots.
    pub fn count(&self) -> u32 {
        self.count
    }

    /// Allocation count divided by `TOTAL_BITS / LEVELS`.
    pub fn level(&self) -> u32 {
        let per_level = (Self::TOTAL_BITS / Self::LEVELS) as u32;
        self.count() / per_level
    }

    /// Whether slot `index` is currently allocated.
    pub fn check(&self, index: u8) -> bool {
        let (word_idx, bit) = Self::split(index);
        self.bits[word_idx] & (1 << bit) != 0
    }

    /// Frees slot `index`; fails with `Error::MetaError` if it was not allocated.
    pub fn deallocate(&mut self, index: u8) -> Result<()> {
        let (word_idx, bit) = Self::split(index);
        let mask: Item = 1 << bit;
        let word = &mut self.bits[word_idx];
        if *word & mask == 0 {
            return Err(Error::MetaError(format!(
                "group state deallocate fail: index {}",
                index
            )));
        }
        *word ^= mask;
        self.count -= 1;
        Ok(())
    }

    /// Applies a merge state: sets every acquired bit, then clears every
    /// released bit, then recomputes the count.
    pub fn update(&mut self, merge_bits: &MergeState) {
        for &pos in &merge_bits.acquire {
            let (word_idx, bit) = Self::split(pos);
            self.bits[word_idx] |= 1 << bit;
        }
        for &pos in &merge_bits.release {
            let (word_idx, bit) = Self::split(pos);
            self.bits[word_idx] &= !(1 << bit);
        }
        self.count = Self::count_set_bits(&self.bits);
    }

    /// Raw byte view of the bitmap (native byte order of the `u64` words).
    pub fn as_bytes(&self) -> &[u8; Self::TOTAL_BYTES] {
        unsafe { std::mem::transmute(&self.bits) }
    }

    /// Mutable raw byte view of the bitmap; does not update the cached count.
    pub fn as_mut_bytes(&mut self) -> &mut [u8; Self::TOTAL_BYTES] {
        unsafe { std::mem::transmute(&mut self.bits) }
    }

    /// Splits a slot index into (word index, bit index within the word).
    fn split(index: u8) -> (usize, u8) {
        ((index / Self::ITEM_BITS) as usize, index % Self::ITEM_BITS)
    }

    /// Total number of set bits across all words.
    fn count_set_bits(bits: &Bits) -> u32 {
        bits.iter().map(|word| word.count_ones()).sum()
    }
}
#[cfg(test)]
mod tests {
use super::*;
// Exercises allocate/deallocate ordering, double-free detection and byte round trip.
#[test]
fn test_group_bits_normal() {
use rand::seq::SliceRandom;
let mut group_state = GroupState::empty();
assert_eq!(group_state.count(), 0);
// Allocation hands out indices in ascending order.
for i in 0..=255 {
assert_eq!(i, group_state.allocate().unwrap());
}
assert!(group_state.allocate().is_none());
assert_eq!(group_state.count(), 256);
let mut vec = (0..=255).collect::<Vec<u8>>();
vec.shuffle(&mut rand::thread_rng());
// Free in random order; a second free of the same index must fail.
for i in vec {
group_state.deallocate(i).unwrap();
group_state.deallocate(i).unwrap_err();
let j = group_state.allocate().unwrap();
group_state.deallocate(j).unwrap();
group_state.deallocate(j).unwrap_err();
}
assert_eq!(group_state.count(), 0);
// Allocate slots 0 and 1, free 0: slot 1 must still be marked.
group_state.allocate().unwrap();
group_state.allocate().unwrap();
group_state.deallocate(0).unwrap();
assert!(group_state.check(1));
let bytes = group_state.as_bytes();
let another_state = GroupState::from(bytes).unwrap();
assert_eq!(another_state, group_state);
// A slice of the wrong length is rejected.
assert!(GroupState::from(&bytes[1..]).is_err());
}
}

View File

@@ -0,0 +1,89 @@
use std::collections::HashSet;
use super::super::*;
use derse::Deserialize;
/// Pending bitmap changes: positions to set and positions to clear.
/// `GroupState::update` applies `acquire` first and `release` second.
#[derive(Clone, Debug, Default, derse::Deserialize, derse::Serialize, PartialEq)]
pub struct MergeState {
/// Positions whose bit will be set.
pub acquire: HashSet<u8>,
/// Positions whose bit will be cleared.
pub release: HashSet<u8>,
}
impl MergeState {
pub fn empty() -> Self {
Self::default()
}
pub fn from(value: &[u8]) -> Result<Self> {
Self::deserialize(value).map_err(Error::SerializationError)
}
pub fn acquire(pos: u8) -> Self {
let mut b = Self::empty();
b.acquire.insert(pos);
b
}
pub fn release(pos: u8) -> Self {
let mut b = Self::empty();
b.release.insert(pos);
b
}
pub fn merge(&mut self, right: &Self) {
for pos in &right.acquire {
self.acquire.insert(*pos);
self.release.remove(pos);
}
for pos in &right.release {
self.acquire.remove(pos);
self.release.insert(*pos);
}
}
}
#[cfg(test)]
mod tests {
use super::*;
// Checks that merged states applied to a GroupState behave like the individual operations.
#[test]
fn test_merge_bits() {
// Applies `merge_bits` to a copy of `bits` and returns the copy.
fn group_bits_apply(mut bits: GroupState, merge_bits: &MergeState) -> GroupState {
bits.update(merge_bits);
bits
}
let state = GroupState::empty();
assert_eq!(group_bits_apply(state, &MergeState::empty()), state);
let acquire_first_bit = MergeState::acquire(0);
let state_after_acquire = group_bits_apply(state, &acquire_first_bit);
assert_eq!(state_after_acquire.as_bytes()[0], 1);
assert_eq!(state_after_acquire.as_bytes()[1..], state.as_bytes()[1..]);
let release_first_bit = MergeState::release(0);
let state_after_release = group_bits_apply(state_after_acquire, &release_first_bit);
assert_eq!(state_after_release, state);
// Acquire followed by release of the same bit collapses to a plain release.
let mut merge_bits = acquire_first_bit;
merge_bits.merge(&release_first_bit);
assert_eq!(merge_bits, release_first_bit);
assert_eq!(state, group_bits_apply(state, &merge_bits));
let mut merge_bits = MergeState::empty();
for i in 0..=255 {
merge_bits.merge(&MergeState::acquire(i));
}
let full_state = group_bits_apply(state, &merge_bits);
assert!(full_state.is_full());
for i in 0..=255 {
merge_bits.merge(&MergeState::release(i));
}
let empty_state = group_bits_apply(full_state, &merge_bits);
assert_eq!(empty_state, state);
// Empty input cannot be decoded.
assert!(MergeState::from(&[]).is_err());
}
}

View File

@@ -0,0 +1,13 @@
mod chunk_meta;
mod constants;
mod group_id;
mod group_state;
mod merge_state;
mod position;
pub use chunk_meta::*;
pub use constants::*;
pub use group_id::*;
pub use group_state::*;
pub use merge_state::*;
pub use position::*;

View File

@@ -0,0 +1,117 @@
use super::super::*;
use derse::{Deserialize, Deserializer, Serialize, Serializer};
/// Slot position packed into a single `u64`; built from a `GroupId` and a slot index.
#[derive(Copy, Clone, Eq, PartialEq, Hash, PartialOrd, Ord)]
#[repr(C)]
pub struct Position(pub u64);
impl Position {
    const SHIFT: u32 = 8;

    // Bit layout, high to low:
    //   bits 40..64  chunk size >> 8
    //   bits 32..40  cluster
    //   bits  8..32  group
    //   bits  0..8   index within the group
    /// Builds a position from a group id and a slot index.
    ///
    /// The cluster is moved from the low byte of the group id into bits
    /// 32..40, which overlap the low byte of the stored chunk size.
    pub const fn new(group_id: GroupId, index: u8) -> Self {
        const CLEAN: u64 = !((GroupId::COUNT - 1) as u64);
        let without_cluster = group_id.inner() & CLEAN;
        let cluster_bits = (group_id.cluster() as u64) << 32;
        Self(without_cluster | index as u64 | cluster_bits)
    }

    /// Reconstructs the group id: clears the index byte and the cluster byte
    /// at bits 32..40, then puts the cluster back into the low byte.
    pub fn group_id(&self) -> GroupId {
        const MARKS: u64 = (GroupId::COUNT - 1) as u64;
        const CLEAN: u64 = !(MARKS | MARKS << 32);
        let raw = self.0 & CLEAN | self.cluster() as u64;
        GroupId::from(raw)
    }

    /// Chunk size: bits 40..64 shifted back so its low byte is zero.
    pub fn chunk_size(&self) -> Size {
        let high = self.0 >> 40;
        Size::new(high << 8)
    }

    /// Cluster stored in bits 32..40.
    pub fn cluster(&self) -> u8 {
        (self.0 >> 32) as u8
    }

    /// Group number stored in bits 8..32.
    pub fn group(&self) -> u32 {
        let low = self.0 as u32;
        low >> Self::SHIFT
    }

    /// Slot index stored in the low byte.
    pub fn index(&self) -> u8 {
        self.0 as u8
    }

    /// Chunk size multiplied by the low 32 bits, i.e. by `group * 256 + index`.
    pub fn offset(&self) -> Size {
        let slot_number = self.0 as u32 as u64;
        self.chunk_size() * slot_number
    }
}
impl Default for Position {
/// Index 0 of the default group id.
fn default() -> Self {
Position::new(GroupId::default(), 0)
}
}
// Wraps a raw packed value without any validation.
impl From<u64> for Position {
fn from(value: u64) -> Self {
Self(value)
}
}
// Lets a Position be used where a `&u64` is expected (e.g. `position.to_be_bytes()`).
impl std::ops::Deref for Position {
type Target = u64;
fn deref(&self) -> &Self::Target {
&self.0
}
}
// Prints the decoded components instead of the raw packed integer.
impl std::fmt::Debug for Position {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"Position {{ chunk_size: {}, cluster: {}, group: {}, index: {} }}",
// `{}` uses the Display form of Size.
self.chunk_size(),
self.cluster(),
self.group(),
self.index(),
)
}
}
// Serialized exactly like the wrapped `u64`.
impl Serialize for Position {
fn serialize_to<T: Serializer>(&self, serializer: &mut T) -> derse::Result<()> {
self.0.serialize_to(serializer)
}
}
// Reads a `u64` and wraps it; the value is not validated.
impl<'a> Deserialize<'a> for Position {
fn deserialize_from<T: Deserializer<'a>>(buf: &mut T) -> derse::Result<Self> {
Ok(Self(u64::deserialize_from(buf)?))
}
}
#[cfg(test)]
mod tests {
use super::*;
// Round-trips the components through GroupId and Position and pins both Debug formats.
#[test]
fn test_group_id_and_position() {
let group_id = GroupId::new(64 * Size::KB, 23, 233);
assert_eq!(group_id.chunk_size(), 64 * Size::KB);
assert_eq!(group_id.cluster(), 23);
assert_eq!(group_id.group(), 233);
assert_eq!(
format!("{:?}", group_id),
"GroupId { chunk_size: 64KiB, cluster: 23, group: 233 }"
);
let position = Position::new(group_id, 223);
assert_eq!(position.chunk_size(), 64 * Size::KB);
assert_eq!(position.cluster(), 23);
assert_eq!(position.group(), 233);
assert_eq!(position.index(), 223);
// The group id recovered from the position equals the one it was built from.
assert_eq!(position.group_id(), group_id);
// `to_be_bytes` is reached through Deref<Target = u64>.
assert_eq!(position.to_be_bytes().len(), 8);
assert_eq!(
format!("{:?}", position),
"Position { chunk_size: 64KiB, cluster: 23, group: 233, index: 223 }"
);
}
}

View File

@@ -0,0 +1,21 @@
use super::super::Size;
/// Alignment (512 bytes) used for buffer addresses, lengths and offsets by the helpers in this file.
pub const ALIGN_SIZE: Size = Size::new(512);
/// Allocates a zero-filled `Vec<u8>` of `size` bytes whose data pointer is
/// aligned to `ALIGN_SIZE`.
///
/// For `size == 0` the vector is empty but still backed by an aligned
/// allocation, so its pointer passes `is_aligned_buf`.
///
/// Panics if `size` cannot form a valid layout, and aborts through
/// `handle_alloc_error` when the allocator is out of memory.
///
/// NOTE(review): the returned `Vec<u8>` frees its buffer with the alignment
/// of `u8`, not `ALIGN_SIZE`. `Vec::from_raw_parts` documents that the
/// alignment must match the one used for allocation, so this remains
/// technically unsound; fixing it needs a dedicated aligned-buffer type and
/// therefore a change of return type.
pub fn create_aligned_vec(size: Size) -> Vec<u8> {
    let len: usize = size.into();
    let align: usize = ALIGN_SIZE.into();
    // The global allocator must never be asked for a zero-sized layout.
    let capacity = if len == 0 { align } else { len };
    let layout = std::alloc::Layout::from_size_align(capacity, align).unwrap();
    // SAFETY: `layout` has a non-zero size, the pointer is checked for null
    // before use, and `alloc_zeroed` initializes all `capacity >= len` bytes,
    // so exposing `len` bytes as initialized `u8` is valid.
    unsafe {
        let ptr = std::alloc::alloc_zeroed(layout);
        if ptr.is_null() {
            std::alloc::handle_alloc_error(layout);
        }
        Vec::from_raw_parts(ptr, len, capacity)
    }
}
pub fn is_aligned_buf(data: &[u8]) -> bool {
data.as_ptr() as u64 % ALIGN_SIZE.0 == 0 && data.len() as u64 % ALIGN_SIZE.0 == 0
}
/// True when `len` is a multiple of `ALIGN_SIZE`.
pub fn is_aligned_len(len: u32) -> bool {
    let align = ALIGN_SIZE.0 as u32;
    len % align == 0
}
/// True when `data` is an aligned buffer and `offset` is an aligned length.
pub fn is_aligned_io(data: &[u8], offset: u32) -> bool {
    if !is_aligned_buf(data) {
        return false;
    }
    is_aligned_len(offset)
}

View File

@@ -0,0 +1 @@
pub type Bytes = tinyvec::TinyVec<[u8; 28]>;

View File

@@ -0,0 +1,15 @@
mod aligned;
mod bytes;
mod result;
mod shards_map;
mod shards_set;
mod size;
mod worker;
pub use aligned::*;
pub use bytes::*;
pub use result::*;
pub use shards_map::*;
pub use shards_set::*;
pub use size::*;
pub use worker::*;

View File

@@ -0,0 +1,34 @@
/// Error type of this crate. Most variants carry a human-readable message.
#[derive(Debug, PartialEq)]
pub enum Error {
IoError(String),
/// Produced by the RocksDB wrapper for open/read/write/iterator failures.
RocksDBError(String),
/// Produced e.g. by `GroupState` for bad input length or a double deallocate.
MetaError(String),
InvalidArg(String),
/// Wraps a `derse` (de)serialization error.
SerializationError(derse::Error),
ChecksumMismatch(String),
ChainVersionMismatch(String),
ChunkETagMismatch(String),
ChunkAlreadyExists,
ChunkCommittedUpdate(String),
ChunkMissingUpdate(String),
NoSpace,
}
pub type Result<T> = std::result::Result<T, Error>;
// Display output is identical to the derived Debug output.
impl std::fmt::Display for Error {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
std::fmt::Debug::fmt(self, f)
}
}
#[cfg(test)]
mod tests {
use super::*;
// Pins the Display text, which mirrors the Debug representation.
#[test]
fn test_error_display() {
let error = Error::InvalidArg("invalid pos".into());
assert_eq!(error.to_string(), r#"InvalidArg("invalid pos")"#);
}
}

View File

@@ -0,0 +1,153 @@
use std::borrow::Borrow;
use std::collections::{
hash_map::{DefaultHasher, Entry},
HashMap,
};
use std::hash::{Hash, Hasher};
/// Hash map split into `S` independent `HashMap` shards; a key's shard is
/// chosen from its `DefaultHasher` hash modulo `S`.
pub struct ShardsMap<K, V, const S: usize = 64> {
    shards: [HashMap<K, V>; S],
}

/// Iterator over all entries of a `ShardsMap`, visiting shard after shard.
pub struct ShardsMapIter<'a, K, V> {
    // Shards that have not been started yet.
    remaining: std::slice::Iter<'a, HashMap<K, V>>,
    // Entries of the shard currently being walked.
    current: std::collections::hash_map::Iter<'a, K, V>,
}

impl<K, V, const S: usize> ShardsMap<K, V, S>
where
    K: Eq + Hash,
{
    /// Creates a map whose shards are all empty.
    pub fn new() -> Self {
        let shards = [(); S].map(|_| HashMap::default());
        Self { shards }
    }

    /// Creates a map where each shard reserves `capacity / S`, rounded up to
    /// a power of two.
    pub fn with_capacity(capacity: usize) -> Self {
        let per_shard = (capacity / S).next_power_of_two();
        let shards = [(); S].map(|_| HashMap::with_capacity(per_shard));
        Self { shards }
    }

    /// Index of the shard responsible for `key`.
    fn shard<Q>(key: &Q) -> usize
    where
        K: Borrow<Q>,
        Q: Eq + Hash + ?Sized,
    {
        let mut hasher = DefaultHasher::new();
        key.hash(&mut hasher);
        let digest = hasher.finish() as usize;
        digest % S
    }

    /// Shared reference to the value stored under `k`, if any.
    pub fn get<Q>(&self, k: &Q) -> Option<&V>
    where
        K: Borrow<Q>,
        Q: Eq + Hash + ?Sized,
    {
        let idx = Self::shard(k);
        self.shards[idx].get(k)
    }

    /// Mutable reference to the value stored under `k`, if any.
    pub fn get_mut<Q>(&mut self, k: &Q) -> Option<&mut V>
    where
        K: Borrow<Q>,
        Q: Eq + Hash + ?Sized,
    {
        let idx = Self::shard(k);
        self.shards[idx].get_mut(k)
    }

    /// True when every shard is empty.
    pub fn is_empty(&self) -> bool {
        !self.shards.iter().any(|shard| !shard.is_empty())
    }

    /// Total number of entries across all shards.
    pub fn len(&self) -> usize {
        self.shards.iter().fold(0, |total, shard| total + shard.len())
    }

    /// Iterates over all entries, starting with shard 0.
    pub fn iter(&self) -> ShardsMapIter<'_, K, V> {
        ShardsMapIter {
            remaining: self.shards[1..].iter(),
            current: self.shards[0].iter(),
        }
    }

    /// Inserts `v` under `k`, returning the previous value if there was one.
    pub fn insert(&mut self, k: K, v: V) -> Option<V> {
        let idx = Self::shard(&k);
        self.shards[idx].insert(k, v)
    }

    /// Entry API of the shard that owns `k`.
    pub fn entry(&mut self, k: K) -> Entry<'_, K, V> {
        let idx = Self::shard(&k);
        self.shards[idx].entry(k)
    }

    /// Removes `k` and returns its value if it was present.
    pub fn remove<Q>(&mut self, k: &Q) -> Option<V>
    where
        K: Borrow<Q>,
        Q: Eq + Hash + ?Sized,
    {
        let idx = Self::shard(k);
        self.shards[idx].remove(k)
    }
}

impl<K, V, const S: usize> Default for ShardsMap<K, V, S>
where
    K: Eq + Hash,
{
    /// Same as `ShardsMap::new`.
    fn default() -> Self {
        Self::new()
    }
}

impl<'a, K, V> Iterator for ShardsMapIter<'a, K, V> {
    type Item = (&'a K, &'a V);

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            match self.current.next() {
                Some(entry) => return Some(entry),
                // Current shard exhausted: move on, or finish when none is left.
                None => self.current = self.remaining.next()?.iter(),
            }
        }
    }
}
#[cfg(test)]
mod tests {
use super::*;
// Exercises insert/get/iter/entry/remove on a four-shard map.
#[test]
fn test_shards_map() {
let mut map = ShardsMap::<usize, usize, 4>::with_capacity(1024);
assert!(map.is_empty());
assert_eq!(map.len(), 0);
const N: usize = 1024;
for i in 0..N {
assert!(map.get(&i).is_none());
map.insert(i, i * i);
}
assert!(!map.is_empty());
assert_eq!(map.len(), N);
// The iterator must visit every entry exactly once across all shards.
assert_eq!(
map.iter()
.map(|(k, v)| {
assert_eq!(k * k, *v);
})
.count(),
N
);
for i in 0..N {
let value = map.get_mut(&i).unwrap();
assert_eq!(i * i, *value);
// Modify through the entry API, then confirm the removed value reflects it.
map.entry(i).and_modify(|v| *v += 1);
assert_eq!(map.remove(&i).unwrap(), i * i + 1);
}
assert!(ShardsMap::<usize, usize, 4>::default().is_empty());
}
}

View File

@@ -0,0 +1,128 @@
use std::borrow::Borrow;
use std::collections::{hash_map::DefaultHasher, HashSet};
use std::hash::{Hash, Hasher};
/// Hash set split into `S` independent `HashSet` shards; a value's shard is
/// chosen from its `DefaultHasher` hash modulo `S`.
pub struct ShardsSet<T, const S: usize = 64> {
    shards: [HashSet<T>; S],
}

/// Iterator over all values of a `ShardsSet`, visiting shard after shard.
pub struct ShardsSetIter<'a, T> {
    // Shards that have not been started yet.
    remaining: std::slice::Iter<'a, HashSet<T>>,
    // Values of the shard currently being walked.
    current: std::collections::hash_set::Iter<'a, T>,
}

impl<T, const S: usize> ShardsSet<T, S>
where
    T: Eq + Hash,
{
    /// Creates a set whose shards are all empty.
    pub fn new() -> Self {
        let shards = [(); S].map(|_| HashSet::default());
        Self { shards }
    }

    /// Creates a set where each shard reserves `capacity / S`, rounded up to
    /// a power of two.
    pub fn with_capacity(capacity: usize) -> Self {
        let per_shard = (capacity / S).next_power_of_two();
        let shards = [(); S].map(|_| HashSet::with_capacity(per_shard));
        Self { shards }
    }

    /// Index of the shard responsible for `key`.
    fn shard<Q>(key: &Q) -> usize
    where
        T: Borrow<Q>,
        Q: Eq + Hash + ?Sized,
    {
        let mut hasher = DefaultHasher::new();
        key.hash(&mut hasher);
        let digest = hasher.finish() as usize;
        digest % S
    }

    /// Whether `value` is present.
    pub fn contains<Q>(&self, value: &Q) -> bool
    where
        T: Borrow<Q>,
        Q: Eq + Hash + ?Sized,
    {
        let idx = Self::shard(value);
        self.shards[idx].contains(value)
    }

    /// True when every shard is empty.
    pub fn is_empty(&self) -> bool {
        !self.shards.iter().any(|shard| !shard.is_empty())
    }

    /// Total number of values across all shards.
    pub fn len(&self) -> usize {
        self.shards.iter().fold(0, |total, shard| total + shard.len())
    }

    /// Iterates over all values, starting with shard 0.
    pub fn iter(&self) -> ShardsSetIter<'_, T> {
        ShardsSetIter {
            remaining: self.shards[1..].iter(),
            current: self.shards[0].iter(),
        }
    }

    /// Inserts `value`; returns true when it was not present before.
    pub fn insert(&mut self, value: T) -> bool {
        let idx = Self::shard(&value);
        self.shards[idx].insert(value)
    }

    /// Removes `value`; returns true when it was present.
    pub fn remove<Q>(&mut self, value: &Q) -> bool
    where
        T: Borrow<Q>,
        Q: Eq + Hash + ?Sized,
    {
        let idx = Self::shard(value);
        self.shards[idx].remove(value)
    }
}

impl<T, const S: usize> Default for ShardsSet<T, S>
where
    T: Eq + Hash,
{
    /// Same as `ShardsSet::new`.
    fn default() -> Self {
        Self::new()
    }
}

impl<'a, T> Iterator for ShardsSetIter<'a, T> {
    type Item = &'a T;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            match self.current.next() {
                Some(value) => return Some(value),
                // Current shard exhausted: move on, or finish when none is left.
                None => self.current = self.remaining.next()?.iter(),
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Exercises insert/contains/iter/remove on a four-shard set.
    // Renamed from `test_shards_map` (copy-paste from the map tests) and
    // extended to cover `iter()`, which was previously untested.
    #[test]
    fn test_shards_set() {
        let mut set = ShardsSet::<usize, 4>::with_capacity(1024);
        assert!(set.is_empty());
        assert_eq!(set.len(), 0);
        assert_eq!(set.iter().count(), 0);

        const N: usize = 1024;
        for i in 0..N {
            assert!(!set.contains(&i));
            assert!(set.insert(i));
        }
        assert!(!set.is_empty());
        assert_eq!(set.len(), N);

        // The iterator must yield every value exactly once across all shards.
        let mut seen: Vec<usize> = set.iter().copied().collect();
        seen.sort_unstable();
        assert_eq!(seen, (0..N).collect::<Vec<_>>());

        for i in 0..N {
            assert!(set.contains(&i));
            assert!(set.remove(&i));
            // A second removal reports that nothing was there.
            assert!(!set.remove(&i));
        }
        assert!(set.is_empty());

        assert!(ShardsSet::<usize>::default().is_empty());
    }
}

View File

@@ -0,0 +1,283 @@
/// Byte count stored as a `u64`, with binary-unit (KiB, MiB, ...) helpers.
#[derive(Default, Copy, Clone, Eq, PartialEq, Hash, PartialOrd, Ord)]
#[repr(C)]
pub struct Size(pub u64);

impl Size {
    pub const B: Size = Size::byte(1);
    pub const KB: Size = Size::kibibyte(1);
    pub const MB: Size = Size::mebibyte(1);
    pub const GB: Size = Size::gibibyte(1);
    pub const TB: Size = Size::tebibyte(1);

    /// Wraps a raw byte count.
    pub const fn new(v: u64) -> Size {
        Size(v)
    }

    /// Zero bytes.
    pub const fn zero() -> Size {
        Size::new(0)
    }

    /// `value` bytes.
    pub const fn byte(value: u64) -> Size {
        Size::new(value)
    }

    /// `value` KiB (`value << 10`; high bits that do not fit are shifted out).
    pub const fn kibibyte(value: u64) -> Size {
        Size::new(value << 10)
    }

    /// `value` MiB (`value << 20`).
    pub const fn mebibyte(value: u64) -> Size {
        Size::new(value << 20)
    }

    /// `value` GiB (`value << 30`).
    pub const fn gibibyte(value: u64) -> Size {
        Size::new(value << 30)
    }

    /// `value` TiB (`value << 40`).
    pub const fn tebibyte(value: u64) -> Size {
        Size::new(value << 40)
    }

    /// Approximate human-readable form with two decimals, using the largest
    /// unit of which the size is at least one half (e.g. 512 B -> "0.50KiB").
    /// Sizes below 512 bytes are printed exactly as "<n>B".
    pub fn around(&self) -> String {
        let bytes = self.0;
        // Compare against half of each unit instead of doubling `bytes`:
        // doubling overflows for values >= 2^63 (panic in debug builds,
        // wrong unit in release builds). All units are even, so the
        // comparison is exactly equivalent for every other value.
        if bytes == 0 {
            "0B".to_string()
        } else if bytes >= Self::TB.0 / 2 {
            format!("{:.2}TiB", (bytes as f64 / Self::TB.0 as f64))
        } else if bytes >= Self::GB.0 / 2 {
            format!("{:.2}GiB", (bytes as f64 / Self::GB.0 as f64))
        } else if bytes >= Self::MB.0 / 2 {
            format!("{:.2}MiB", (bytes as f64 / Self::MB.0 as f64))
        } else if bytes >= Self::KB.0 / 2 {
            format!("{:.2}KiB", (bytes as f64 / Self::KB.0 as f64))
        } else {
            format!("{}B", bytes)
        }
    }

    /// Whether the byte count is a power of two.
    pub fn is_power_of_two(&self) -> bool {
        self.0.is_power_of_two()
    }

    /// Smallest power of two that is >= the byte count.
    pub fn next_power_of_two(&self) -> Size {
        Size(self.0.next_power_of_two())
    }

    /// Number of trailing zero bits of the byte count.
    pub fn trailing_zeros(&self) -> u32 {
        self.0.trailing_zeros()
    }
}
// Generates conversions, equality and arithmetic between `Size` and each listed
// primitive integer type. Every primitive operand is converted with `as u64`,
// so negative signed values wrap instead of being rejected.
macro_rules! impl_trait_for_size {
($($t:ty),*) => {
// Primitive -> Size.
$(impl From<$t> for Size {
fn from(value: $t) -> Self {
Self::new(value as _)
}
}
// Size -> primitive; `as` truncates when the target type is narrower than u64.
impl From<Size> for $t {
fn from(val: Size) -> Self {
val.0 as _
}
}
// Equality in both operand orders.
impl PartialEq<$t> for Size {
fn eq(&self, other: &$t) -> bool {
self.0 == *other as u64
}
}
impl PartialEq<Size> for $t {
fn eq(&self, other: &Size) -> bool {
*self as u64 == other.0
}
}
// Addition in both operand orders, plus `+=`.
impl std::ops::Add<$t> for Size {
type Output = Size;
fn add(self, rhs: $t) -> Self::Output {
Size::new(self.0 + rhs as u64)
}
}
impl std::ops::Add<Size> for $t {
type Output = Size;
fn add(self, rhs: Size) -> Self::Output {
Size::new(self as u64 + rhs.0)
}
}
impl std::ops::AddAssign<$t> for Size {
fn add_assign(&mut self, rhs: $t) {
self.0 += rhs as u64;
}
}
// Multiplication in both operand orders, plus `*=`.
impl std::ops::Mul<$t> for Size {
type Output = Size;
fn mul(self, rhs: $t) -> Self::Output {
Size::new(self.0 * rhs as u64)
}
}
impl std::ops::Mul<Size> for $t {
type Output = Size;
fn mul(self, rhs: Size) -> Self::Output {
Size::new(self as u64 * rhs.0)
}
}
impl std::ops::MulAssign<$t> for Size {
fn mul_assign(&mut self, rhs: $t) {
self.0 *= rhs as u64;
}
}
// Remainder in both operand orders; the result is a Size.
impl std::ops::Rem<$t> for Size {
type Output = Size;
fn rem(self, rhs: $t) -> Self::Output {
Size::new(self.0 % rhs as u64)
}
}
impl std::ops::Rem<Size> for $t {
type Output = Size;
fn rem(self, rhs: Size) -> Self::Output {
Size::new(self as u64 % rhs.0)
}
}
)*
};
}
// Instantiate the impls for the integer types used with Size.
impl_trait_for_size! {i32, i64, u32, u64, usize}
impl std::ops::Add<Self> for Size {
type Output = Size;
fn add(self, rhs: Self) -> Self::Output {
Size::new(self.0 + rhs.0)
}
}
impl std::ops::AddAssign<Self> for Size {
fn add_assign(&mut self, rhs: Self) {
self.0 += rhs.0;
}
}
impl std::ops::Sub<Self> for Size {
type Output = Size;
fn sub(self, rhs: Self) -> Self::Output {
Size::new(self.0 - rhs.0)
}
}
impl std::ops::Mul<Self> for Size {
type Output = Size;
fn mul(self, rhs: Self) -> Self::Output {
Size::new(self.0 * rhs.0)
}
}
impl std::ops::Div<Self> for Size {
type Output = Size;
fn div(self, rhs: Self) -> Self::Output {
Size::new(self.0 / rhs.0)
}
}
impl std::ops::Rem<Self> for Size {
type Output = Size;
fn rem(self, rhs: Self) -> Self::Output {
Size::new(self.0 % rhs.0)
}
}
/// Exact rendering: the size is printed as an integer count of the largest
/// binary unit that divides it evenly (e.g. 65536 -> "64KiB", 233 -> "233B").
impl std::fmt::Display for Size {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let bytes = self.0;
        if bytes == 0 {
            return write!(f, "0B");
        }
        if bytes % Self::TB.0 == 0 {
            return write!(f, "{}TiB", bytes / Self::TB.0);
        }
        if bytes % Self::GB.0 == 0 {
            return write!(f, "{}GiB", bytes / Self::GB.0);
        }
        if bytes % Self::MB.0 == 0 {
            return write!(f, "{}MiB", bytes / Self::MB.0);
        }
        if bytes % Self::KB.0 == 0 {
            return write!(f, "{}KiB", bytes / Self::KB.0);
        }
        write!(f, "{}B", bytes)
    }
}
// Debug prints the approximate form from `around()`, unlike Display which is exact.
impl std::fmt::Debug for Size {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.around())
}
}
#[cfg(test)]
mod tests {
// Covers constructors, Display (exact), Debug (approximate) and the mixed-type operators.
#[test]
fn test_size() {
use super::Size;
let size = Size::zero();
assert_eq!(size, Size::new(0));
assert_eq!(size.to_string(), "0B".to_string());
let size = Size::kibibyte(64);
assert_eq!(size, Size::new(65536));
assert_eq!(size.to_string(), "64KiB".to_string());
let size: Size = Size::MB * 23;
assert_eq!(size, Size::new(23 << 20));
assert_eq!(size.to_string(), "23MiB".to_string());
let size: Size = 233 * Size::GB;
assert_eq!(size, Size::new(233 << 30));
assert_eq!(size.to_string(), "233GiB".to_string());
// Display: largest unit that divides the value evenly.
assert_eq!(format!("{}", Size::zero()), "0B".to_string());
assert_eq!(format!("{}", Size::byte(233)), "233B".to_string());
assert_eq!(format!("{}", Size::byte(512)), "512B".to_string());
assert_eq!(format!("{}", Size::kibibyte(512)), "512KiB".to_string());
assert_eq!(format!("{}", Size::mebibyte(512)), "512MiB".to_string());
assert_eq!(format!("{}", Size::gibibyte(512)), "512GiB".to_string());
assert_eq!(format!("{}", Size::tebibyte(512)), "512TiB".to_string());
// Debug: two-decimal approximation in the largest unit the value is at least half of.
assert_eq!(format!("{:?}", Size::zero()), "0B".to_string());
assert_eq!(format!("{:?}", Size::byte(233)), "233B".to_string());
assert_eq!(format!("{:?}", Size::byte(512)), "0.50KiB".to_string());
assert_eq!(format!("{:?}", Size::kibibyte(512)), "0.50MiB".to_string());
assert_eq!(format!("{:?}", Size::mebibyte(512)), "0.50GiB".to_string());
assert_eq!(format!("{:?}", Size::gibibyte(512)), "0.50TiB".to_string());
assert_eq!(format!("{:?}", Size::tebibyte(512)), "512.00TiB".to_owned());
// Random factor below 1024 so none of the shifts below can overflow.
let r = rand::random::<u64>() % 1024;
assert_eq!(0 + Size::kibibyte(r), Size::from(r << 10));
assert_eq!(Size::mebibyte(r) + 0, Size::from(r << 20));
assert_eq!(1 * Size::gibibyte(r), Size::from(r << 30));
assert_eq!(Size::tebibyte(r) * 1, Size::from(r << 40));
// Size * Size multiplies the raw byte counts.
assert_eq!(Size::KB * Size::KB, Size::MB);
let mut size = Size::B;
size *= 1024;
assert_eq!(size, Size::KB);
assert_eq!(size % 1000, 24);
assert_eq!(Size::KB + Size::KB, Size::kibibyte(2));
assert_eq!(Size::KB % 1000, Size(24));
}
}

View File

@@ -0,0 +1,136 @@
use std::{
sync::{
atomic::{AtomicBool, Ordering},
Arc, Condvar, Mutex,
},
thread::JoinHandle,
};
/// Instruction returned by the worker closure telling the worker loop what to
/// do after the current invocation.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum WorkerState {
/// Invoke the closure again right away.
Continue,
/// Block on the worker's condition variable until it is notified.
Pause,
/// Block on the worker's condition variable for at most the given duration.
Wait(std::time::Duration),
/// Leave the loop so the worker thread finishes.
Stop,
}
/// Builder collecting the optional settings passed to `Worker::new`.
#[derive(Default)]
pub struct WorkerBuilder {
/// Name for the spawned thread; the thread is left unnamed when `None`.
name: Option<String>,
/// Condition variable the worker waits on; `Worker::new` creates a default
/// one when this is `None`.
condvar: Option<Arc<Condvar>>,
}
impl WorkerBuilder {
    /// Sets the name of the thread that will run the worker.
    pub fn name(self, str: String) -> Self {
        Self {
            name: Some(str),
            ..self
        }
    }

    /// Sets the condition variable the worker waits on when it pauses.
    pub fn cond(self, condvar: Arc<Condvar>) -> Self {
        Self {
            condvar: Some(condvar),
            ..self
        }
    }

    /// Consumes the builder and starts a worker that repeatedly calls `f`.
    pub fn spawn<F>(self, f: F) -> Worker
    where
        F: FnMut() -> WorkerState + Send + 'static,
    {
        let Self { name, condvar } = self;
        Worker::new(f, name, condvar)
    }
}
/// Handle to a background thread that repeatedly invokes a user closure and
/// follows the `WorkerState` the closure returns.
pub struct Worker {
    /// Set by `stop_and_join`; the worker thread checks it before every
    /// invocation and before every wait.
    stopping: Arc<AtomicBool>,
    /// Condition variable the worker thread waits on for `Pause` / `Wait`.
    condvar: Arc<Condvar>,
    /// Mutex paired with `condvar`. It is shared with `stop_and_join` so that
    /// setting `stopping` cannot slip in between the worker's check of the
    /// flag and the moment it starts waiting.
    mutex: Arc<Mutex<()>>,
    /// Join handle of the worker thread; `None` once it has been joined.
    handle: Option<JoinHandle<()>>,
}

impl Worker {
    /// Spawns the worker thread.
    ///
    /// * `f` - closure called in a loop; its return value drives the loop.
    /// * `name` - optional thread name.
    /// * `condvar` - optional condition variable to wait on; a default one is
    ///   created when `None`.
    ///
    /// Panics if the OS thread cannot be spawned.
    pub fn new<F>(mut f: F, name: Option<String>, condvar: Option<Arc<Condvar>>) -> Worker
    where
        F: FnMut() -> WorkerState + Send + 'static,
    {
        let stopping = Arc::new(AtomicBool::default());
        let condvar = condvar.unwrap_or_default();
        let mutex = Arc::new(Mutex::new(()));

        let builder = match name {
            Some(name) => std::thread::Builder::new().name(name),
            None => std::thread::Builder::new(),
        };

        let handle = {
            let stopping = stopping.clone();
            let condvar = condvar.clone();
            let mutex = mutex.clone();
            builder
                .spawn(move || {
                    while !stopping.load(Ordering::Acquire) {
                        match f() {
                            WorkerState::Continue => continue,
                            WorkerState::Pause => {
                                let guard = mutex.lock().unwrap();
                                // Re-check under the lock: `stop_and_join` sets the
                                // flag while holding the same mutex, so a stop
                                // request is either seen here or its notification
                                // arrives after the wait has begun.
                                if stopping.load(Ordering::Acquire) {
                                    break;
                                }
                                drop(condvar.wait(guard).unwrap());
                            }
                            WorkerState::Wait(duration) => {
                                let guard = mutex.lock().unwrap();
                                if stopping.load(Ordering::Acquire) {
                                    break;
                                }
                                drop(condvar.wait_timeout(guard, duration).unwrap());
                            }
                            WorkerState::Stop => break,
                        }
                    }
                })
                .unwrap()
        };

        Worker {
            stopping,
            condvar,
            mutex,
            handle: Some(handle),
        }
    }

    /// Requests the worker to stop, wakes it if it is waiting, and joins the
    /// thread. Calling it again after the thread was joined only repeats the
    /// notification. Panics if the worker thread panicked.
    pub fn stop_and_join(&mut self) {
        {
            // Publish the flag under the mutex the worker holds while checking
            // it, which closes the lost-wakeup window.
            let _guard = self.mutex.lock().unwrap();
            self.stopping.store(true, Ordering::Release);
        }
        self.condvar.notify_all();
        if let Some(handle) = self.handle.take() {
            handle.join().unwrap();
        }
    }
}
// Unit tests for `Worker` / `WorkerBuilder`.
#[cfg(test)]
mod tests {
use super::*;
// The closure returns `Continue` for its first nine calls and `Pause` on the
// tenth. After stopping, the counter must still be exactly 10, i.e. the
// closure was not invoked again once the worker paused.
#[test]
fn test_worker() {
let count = Arc::new(std::sync::atomic::AtomicUsize::default());
let condvar = Default::default();
let count_clone = count.clone();
let mut worker = WorkerBuilder::default()
.name("Worker".into())
.cond(condvar)
.spawn(move || {
if count_clone.fetch_add(1, Ordering::SeqCst) + 1 < 10 {
WorkerState::Continue
} else {
WorkerState::Pause
}
});
// Poll until the worker has made its tenth call.
while count.load(Ordering::Acquire) < 10 {
std::thread::sleep(std::time::Duration::from_millis(10));
}
worker.stop_and_join();
assert_eq!(count.load(Ordering::Acquire), 10);
}
// A closure that returns `Stop` immediately lets the thread finish by itself,
// so the handle can be joined without calling `stop_and_join`.
#[test]
fn test_worker_2() {
let worker = WorkerBuilder::default().spawn(move || WorkerState::Stop);
let _ = worker.handle.unwrap().join();
}
}

View File

@@ -0,0 +1,183 @@
#include "storage/service/BufferPool.h"
#include <folly/experimental/coro/BlockingWait.h>
#include <folly/experimental/coro/Collect.h>
#include <folly/experimental/coro/Invoke.h>
#include <folly/experimental/coro/Task.h>
#include <sys/uio.h>
#include "common/monitor/Recorder.h"
#include "common/net/ib/RDMABuf.h"
#include "common/utils/MagicEnum.hpp"
#include "fbs/storage/Common.h"
namespace hf3fs::storage {
namespace {
// Advances `rdmabuf` so that its start address becomes a multiple of
// kAIOAlignSize. If the buffer is shorter than the required padding, it is
// advanced by its whole size instead.
void alignBuffer(net::RDMABuf &rdmabuf) {
  auto misalignment = reinterpret_cast<uint64_t>(rdmabuf.ptr()) % kAIOAlignSize;
  if (misalignment != 0) {
    auto padding = kAIOAlignSize - misalignment;
    rdmabuf.advance(std::min(padding, rdmabuf.size()));
  }
}
} // namespace
// Allocates the small and the big RDMA buffer pools and publishes one iovec
// per backing buffer.
//
// Small buffers are registered first, so every register index at or above
// bigBufferRegisterIndexStart_ belongs to the big pool. Each pool may use at
// most half of UIO_MAXIOV backing buffers. Returns the first allocation error.
Result<Void> BufferPool::init(CPUExecutorGroup &executor) {
  constexpr auto kBackingBufferLimitPerPool = UIO_MAXIOV / 2;

  buffers_.clear();
  buffers_.reserve(UIO_MAXIOV);

  auto smallBufferResult = initBuffers(executor,
                                       config_.rdmabuf_size(),
                                       config_.rdmabuf_count(),
                                       kBackingBufferLimitPerPool,
                                       buffers_);
  RETURN_AND_LOG_ON_ERROR(smallBufferResult);
  *freeIndex_.lock() = std::move(*smallBufferResult);

  // Everything appended to buffers_ from here on backs the big pool.
  bigBufferRegisterIndexStart_ = buffers_.size();
  auto bigBufferResult = initBuffers(executor,
                                     config_.big_rdmabuf_size(),
                                     config_.big_rdmabuf_count(),
                                     kBackingBufferLimitPerPool,
                                     buffers_);
  RETURN_AND_LOG_ON_ERROR(bigBufferResult);
  *bigFreeIndex_.lock() = std::move(*bigBufferResult);

  // Rebuild the iovec view over all backing buffers, in register-index order.
  iovecs_.clear();
  iovecs_.reserve(buffers_.size());
  for (auto &backing : buffers_) {
    iovecs_.push_back({(void *)backing.ptr(), backing.size()});
  }
  return Void{};
}
// Allocates the backing RDMA memory for `rdmabufCount` slices of `rdmabufSize`
// bytes each, using at most `limit` backing buffers.
//
// Every backing buffer is aligned, appended to `outBuffers` (its position there
// becomes the slice's registerIndex) and then cut into `rdmabufSize` slices.
//
// @param executor     executors on which the allocations are scheduled.
// @param rdmabufSize  size of one slice.
// @param rdmabufCount number of slices requested.
// @param limit        maximum number of backing buffers to create.
// @param outBuffers   receives the backing buffers.
// @return the free list of slices, or kStorageInitFailed when the parameters
//         are unusable or any allocation fails. Requesting zero slices yields
//         an empty free list.
Result<std::vector<BufferIndex>> BufferPool::initBuffers(CPUExecutorGroup &executor,
                                                         Size rdmabufSize,
                                                         uint32_t rdmabufCount,
                                                         uint32_t limit,
                                                         std::vector<net::RDMABuf> &outBuffers) {
  // Nothing requested: nothing to allocate, and the divisions below would
  // otherwise divide by zero.
  if (rdmabufCount == 0) {
    return Result<std::vector<BufferIndex>>(std::vector<BufferIndex>{});
  }

  size_t totalSize = rdmabufSize * rdmabufCount;
  // A zero limit or a zero slice size would also lead to a division by zero.
  if (UNLIKELY(limit == 0 || totalSize == 0)) {
    auto msg = fmt::format("storage init buffer pool failed: invalid buffer size {} or limit {}", rdmabufSize, limit);
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }

  size_t bufferCount = std::min(limit, rdmabufCount);
  // Slices per backing buffer, rounded up so that each backing buffer holds a
  // whole number of slices.
  size_t smallBufferCount = (totalSize / bufferCount + rdmabufSize - 1) / rdmabufSize;
  size_t bufferSize = smallBufferCount * rdmabufSize;

  auto pool = net::RDMABufPool::create(bufferSize, bufferCount);
  std::vector<folly::coro::TaskWithExecutor<net::RDMABuf>> tasks;
  tasks.reserve(bufferCount);
  for (auto i = 0u; i < bufferCount; ++i) {
    tasks.push_back(pool->allocate().scheduleOn(&executor.pickNext()));
  }
  XLOGF(INFO, "allocate {} * {} RDMA buffers started", bufferCount, Size{bufferSize});
  auto buffers = folly::coro::blockingWait(folly::coro::collectAllRange(std::move(tasks)));
  XLOGF(INFO, "allocate {} * {} RDMA buffers finished", bufferCount, Size{bufferSize});

  std::vector<BufferIndex> freeIndex;
  freeIndex.reserve(rdmabufCount);
  for (auto &buf : buffers) {
    if (UNLIKELY(!buf)) {
      auto msg = fmt::format("storage init buffer pool failed");
      XLOG(ERR, msg);
      return makeError(StorageCode::kStorageInitFailed, std::move(msg));
    }
    alignBuffer(buf);

    BufferIndex bufferIndex;
    bufferIndex.registerIndex = outBuffers.size();
    outBuffers.push_back(buf);

    // Cut the aligned backing buffer into as many full slices as fit.
    auto split = buf;
    for (; split.size() >= rdmabufSize; split.advance(rdmabufSize)) {
      bufferIndex.buffer = split.first(rdmabufSize);
      freeIndex.push_back(bufferIndex);
    }
  }
  return Result<std::vector<BufferIndex>>(std::move(freeIndex));
}
// Hands every slice acquired through this Buffer back to the owning pool.
BufferPool::Buffer::~Buffer() {
  for (auto it = indices_.begin(); it != indices_.end(); ++it) {
    pool_->deallocate(*it);
  }
}
// Non-blocking allocation of `size` bytes from the small pool.
//
// The request is served from the slice currently held when it still has room;
// otherwise a new slice is taken without waiting. Returns kBufferSizeExceeded
// when `size` is larger than a small slice and kRDMANoBuf when no slice is
// available right now.
Result<net::RDMABuf> BufferPool::Buffer::tryAllocate(uint32_t size) {
  const bool needNewSlice = indices_.empty() || current_.size() < size;
  if (needNewSlice) {
    if (UNLIKELY(size > pool_->rdmabufSize_)) {
      return makeError(StorageCode::kBufferSizeExceeded);
    }
    if (UNLIKELY(!pool_->semaphore_.try_wait())) {
      return makeError(RPCCode::kRDMANoBuf);
    }
    auto index = pool_->allocate();
    indices_.push_back(index);
    current_ = index.buffer;
  }

  auto ret = current_.takeFirst(size);
  assert(ret);
  // Keep the remainder aligned for the next request.
  alignBuffer(current_);
  return ret;
}
// Allocation of `size` bytes that suspends until a slice is available.
//
// The request is served from the slice currently held when it still has room.
// Otherwise a new slice is taken from the small pool, or from the big pool
// when `size` exceeds the small slice size. Returns kBufferSizeExceeded when
// `size` is larger than a big slice.
CoTryTask<net::RDMABuf> BufferPool::Buffer::allocate(uint32_t size) {
  if (indices_.empty() || current_.size() < size) {
    if (UNLIKELY(size > pool_->bigRdmabufSize_)) {
      co_return makeError(StorageCode::kBufferSizeExceeded);
    }

    const bool fromBigPool = size > pool_->rdmabufSize_;
    if (UNLIKELY(fromBigPool)) {
      co_await pool_->bigSemaphore_.co_wait();
    } else {
      co_await pool_->semaphore_.co_wait();
    }
    auto index = fromBigPool ? pool_->allocateBig() : pool_->allocate();
    indices_.push_back(index);
    current_ = index.buffer;
  }

  auto ret = current_.takeFirst(size);
  assert(ret);
  // Keep the remainder aligned for the next request.
  alignBuffer(current_);
  co_return ret;
}
// Releases every backing RDMA buffer on the executors of `executor` and blocks
// until all of them have been released. The buffers are moved out of buffers_
// one by one; buffers_ itself keeps its size.
void BufferPool::clear(CPUExecutorGroup &executor) {
  std::vector<folly::coro::TaskWithExecutor<void>> releaseTasks;
  releaseTasks.reserve(buffers_.size());
  for (auto &backing : buffers_) {
    auto releaseTask = folly::coro::co_invoke([&, buf = std::move(backing)]() mutable -> CoTask<void> {
      buf = {};
      co_return;
    });
    releaseTasks.push_back(std::move(releaseTask).scheduleOn(&executor.pickNext()));
  }

  XLOGF(INFO, "deallocate {} RDMA buffers started", buffers_.size());
  folly::coro::blockingWait(folly::coro::collectAllRange(std::move(releaseTasks)));
  XLOGF(INFO, "deallocate {} RDMA buffers finished", buffers_.size());
}
// Pops one slice from the small free list. The list must not be empty; the
// visible callers take a token from semaphore_ before calling this.
BufferIndex BufferPool::allocate() {
  auto freeList = freeIndex_.lock();
  assert(!freeList->empty());
  BufferIndex picked = freeList->back();
  freeList->pop_back();
  return picked;
}
// Pops one slice from the big free list. The list must not be empty; the
// visible caller takes a token from bigSemaphore_ before calling this.
BufferIndex BufferPool::allocateBig() {
  auto freeList = bigFreeIndex_.lock();
  assert(!freeList->empty());
  BufferIndex picked = freeList->back();
  freeList->pop_back();
  return picked;
}
void BufferPool::deallocate(const BufferIndex &index) {
if (UNLIKELY(index.registerIndex >= bigBufferRegisterIndexStart_)) {
bigFreeIndex_.lock()->push_back(index);
bigSemaphore_.signal();
} else {
freeIndex_.lock()->push_back(index);
semaphore_.signal();
}
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,92 @@
#pragma once
#include <folly/Synchronized.h>
#include <folly/executors/CPUThreadPoolExecutor.h>
#include <folly/fibers/Semaphore.h>
#include <limits>
#include "common/net/ib/RDMABuf.h"
#include "common/utils/CPUExecutorGroup.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/ConstructLog.h"
#include "common/utils/Size.h"
namespace hf3fs::storage {
// One slice handed out by BufferPool together with the backing buffer it
// belongs to.
struct BufferIndex {
// Position of the backing buffer in the pool's list of registered buffers.
uint32_t registerIndex;
// The slice itself.
net::RDMABuf buffer;
};
class BufferPool {
public:
class Config : public ConfigBase<Config> {
CONFIG_ITEM(rdmabuf_size, 4_MB);
CONFIG_ITEM(rdmabuf_count, 1024u);
CONFIG_ITEM(big_rdmabuf_size, 64_MB);
CONFIG_ITEM(big_rdmabuf_count, 64u);
};
BufferPool(const Config &config)
: config_(config),
rdmabufSize_(config_.rdmabuf_size()),
semaphore_(config_.rdmabuf_count()),
bigRdmabufSize_(config_.big_rdmabuf_size()),
bigSemaphore_(config_.big_rdmabuf_count()) {}
Result<Void> init(CPUExecutorGroup &executor);
auto &iovecs() const { return iovecs_; }
class Buffer {
public:
explicit Buffer(BufferPool &pool)
: pool_(&pool) {}
Buffer(const Buffer &) = delete;
Buffer(Buffer &&other) = default;
Buffer &operator=(Buffer &&other) = default;
~Buffer();
Result<net::RDMABuf> tryAllocate(uint32_t size);
CoTryTask<net::RDMABuf> allocate(uint32_t size);
auto index() const { return indices_.back().registerIndex; }
private:
BufferPool *pool_{};
std::vector<BufferIndex> indices_;
net::RDMABuf current_;
};
auto get() { return Buffer{*this}; }
void clear(CPUExecutorGroup &executor);
protected:
static Result<std::vector<BufferIndex>> initBuffers(CPUExecutorGroup &executor,
Size rdmabufSize,
uint32_t rdmabufCount,
uint32_t limit,
std::vector<net::RDMABuf> &outBuffers);
BufferIndex allocate();
BufferIndex allocateBig();
void deallocate(const BufferIndex &index);
private:
ConstructLog<"storage::BufferPool"> constructLog_;
const Config &config_;
Size rdmabufSize_;
std::vector<net::RDMABuf> buffers_;
std::vector<struct iovec> iovecs_;
folly::fibers::Semaphore semaphore_;
folly::Synchronized<std::vector<BufferIndex>, std::mutex> freeIndex_;
Size bigRdmabufSize_;
uint32_t bigBufferRegisterIndexStart_ = 0;
folly::fibers::Semaphore bigSemaphore_;
folly::Synchronized<std::vector<BufferIndex>, std::mutex> bigFreeIndex_;
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,263 @@
#include "storage/service/Components.h"
#include <folly/experimental/coro/BlockingWait.h>
#include "common/app/ApplicationBase.h"
#include "common/monitor/Recorder.h"
#include "common/utils/LogCommands.h"
#include "stubs/common/RealStubFactory.h"
#include "stubs/mgmtd/MgmtdServiceStub.h"
namespace hf3fs::storage {
namespace {
// Name under which Components registers and removes its routing-info listener
// on the mgmtd client.
constexpr std::string_view kRoutingInfoListenerName = "Components";
// Recorder that receives each target's local state, tagged with the target id,
// whenever the heartbeat payload is rebuilt.
monitor::ValueRecorder targetStateRecorder{"storage.target_state", std::nullopt, false};
} // namespace
// Wires every storage component to its section of `config`. Nothing is started
// here; see start(). `config` is kept by reference. Several components also
// receive a reference to this Components object.
Components::Components(const Config &config)
    : config(config),
      rdmabufPool(config.buffer_pool()),
      storageTargets(config.targets(), targetMap),
      aioReadWorker(config.aio_read_worker()),
      messenger(config.forward_client()),
      resyncWorker(config.sync_worker(), *this),
      checkWorker(config.check_worker(), *this),
      dumpWorker(config.dump_worker(), *this),
      allocateWorker(config.allocate_worker(), *this),
      punchHoleWorker(*this),
      syncMetaKvWorker(config.sync_meta_kv_worker(), *this),
      reliableForwarding(config.reliable_forwarding(), *this),
      readPool(config.coroutines_pool_read(), "ReadPool"),
      updatePool(config.coroutines_pool_update(), "UpdatePool"),
      // The sync pool uses its own config section (coroutines_pool_sync), not
      // the default pool's.
      syncPool(config.coroutines_pool_sync(), "SyncPool"),
      defaultPool(config.coroutines_pool_default(), "DefaultPool"),
      storageOperator(config.storage(), *this),
      reliableUpdate(config.reliable_update(), *this) {}
// Starts all components in a fixed order and returns Void on success.
// Each step is wrapped in RETURN_ON_ERROR_LOG_WRAPPED, which presumably logs
// the step name and returns early on failure (macro defined elsewhere).
//
// @param appInfo identity of this node; stored and forwarded to components.
// @param tpg     thread pools used for buffer allocation, target loading and
//                the mgmtd client.
Result<Void> Components::start(const flat::AppInfo &appInfo, net::ThreadPoolGroup &tpg) {
this->appInfo = appInfo;
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start rdmabufPool", rdmabufPool.init(tpg.procThreadPool()));
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start readPool", readPool.start());
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start updatePool", updatePool.start());
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start syncPool", syncPool.start());
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start defaultPool", defaultPool.start());
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start messenger", messenger.start());
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start reliableForwarding", reliableForwarding.init());
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start storageTargets", storageTargets.load(tpg.procThreadPool()));
// The AIO read worker takes the target file descriptors and the RDMA iovecs,
// so it starts after the targets are loaded and the buffer pool is set up.
RETURN_ON_ERROR_LOG_WRAPPED(INFO,
"Start aioReadWorker",
aioReadWorker.start(storageTargets.fds(), rdmabufPool.iovecs()));
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start dumpWorker", dumpWorker.start(appInfo.nodeId));
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start allocateWorker", allocateWorker.start());
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start punchHoleWorker", punchHoleWorker.start());
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start syncMetaKvWorker", syncMetaKvWorker.start());
// Blocks until routing info no longer lists this node's targets as active.
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start waitRoutingInfo", waitRoutingInfo(appInfo, tpg.bgThreadPool().randomPick()));
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start resyncWorker", resyncWorker.start());
RETURN_ON_ERROR_LOG_WRAPPED(INFO,
"Start checkWorker",
checkWorker.start(storageTargets.targetPaths(), storageTargets.manufacturers()));
RETURN_ON_ERROR_LOG_WRAPPED(INFO, "Start storageOperator", storageOperator.init(storageTargets.targetPaths().size()));
return Void{};
}
// Connects to mgmtd, reports all local targets as offline, and blocks until
// the routing info no longer shows any local target as SERVING, SYNCING or
// WAITING. Afterwards it installs the callbacks that keep the heartbeat
// payload and the local target map in sync with routing updates.
//
// @param appInfo  identity of this node, attached to heartbeats.
// @param executor executor on which the mgmtd client is started.
// @return an error if the net client fails to start, the initial routing
//         refresh fails, or the routing-info listener cannot be registered.
Result<Void> Components::waitRoutingInfo(const flat::AppInfo &appInfo, folly::CPUThreadPoolExecutor &executor) {
// 1. init mgmtd client.
// Both clients are created only if they do not exist yet.
if (!netClient) {
netClient = std::make_unique<net::Client>(config.client());
RETURN_AND_LOG_ON_ERROR(netClient->start());
}
if (mgmtdClient.load() == nullptr) {
auto stubFactory = std::make_unique<hf3fs::stubs::RealStubFactory<mgmtd::MgmtdServiceStub>>(
hf3fs::stubs::ClientContextCreator{[&](net::Address addr) { return netClient->serdeCtx(addr); }});
mgmtdClient = std::make_shared<hf3fs::client::MgmtdClientForServer>(appInfo.clusterId,
std::move(stubFactory),
config.mgmtd());
}
mgmtdClient.load()->setAppInfoForHeartbeat(appInfo);
mgmtdClient.load()->setConfigListener(ApplicationBase::updateConfig);
// 2. wait target offline.
// Publish every local target as OFFLINE before the client starts.
auto currentMap = targetMap.snapshot();
updateHeartbeatPayload(*currentMap, true);
folly::coro::blockingWait(mgmtdClient.load()->start(&executor));
// Poll once per second (no sleep before the first attempt); failed attempts
// are logged and retried.
for (auto sleep = 0;; ++sleep) {
if (sleep) {
XLOGF(WARNING, "Waiting for target offline in routing info...");
std::this_thread::sleep_for(1000_ms);
}
folly::coro::blockingWait(mgmtdClient.load()->heartbeat());
// Work on a clone so the live target map is not touched while waiting.
auto copy = currentMap->clone();
auto refreshResult = folly::coro::blockingWait(mgmtdClient.load()->refreshRoutingInfo(false));
if (UNLIKELY(!refreshResult)) {
XLOGF(ERR, "refresh routing info error {}", refreshResult.error());
continue;
}
auto result = copy->updateRouting(mgmtdClient.load()->getRoutingInfo(), false);
if (UNLIKELY(!result)) {
XLOGF(ERR, "get and parse routing info error {}", result.error());
continue;
}
// Keep waiting while any local target is still publicly active.
bool needWaiting = false;
for (auto &[targetId, target] : copy->getTargets()) {
if (target.publicState == flat::PublicTargetState::SERVING ||
target.publicState == flat::PublicTargetState::SYNCING ||
target.publicState == flat::PublicTargetState::WAITING) {
XLOGF(WARNING, "waiting for chain {} target {}", targetId, serde::toJsonString(target));
needWaiting = true;
break;
}
}
if (!needWaiting) {
break;
}
}
// 3. set listener.
// From now on every target map update refreshes the heartbeat payload.
targetMap.setUpdateCallback([this](const TargetMap &map) { updateHeartbeatPayload(map); });
RETURN_AND_LOG_ON_ERROR(refreshRoutingInfo());
folly::coro::blockingWait(mgmtdClient.load()->heartbeat());
XLOGF(INFO, "Initial target map: {}", serde::toJsonString(targetMap.snapshot()->getTargets()));
// The listener ignores the result of refreshRoutingInfo().
bool succ = mgmtdClient.load()->addRoutingInfoListener(std::string{kRoutingInfoListenerName},
[this](auto) { refreshRoutingInfo(); });
if (UNLIKELY(!succ)) {
auto msg = fmt::format("node {} addRoutingInfoListener failed!", appInfo.nodeId);
XLOG(ERR, msg);
return makeError(StorageCode::kStorageInitFailed, std::move(msg));
}
return Void{};
}
// Applies the routing info currently held by the mgmtd client to the local
// target map and returns the outcome of that update.
Result<Void> Components::refreshRoutingInfo() {
  auto client = mgmtdClient.load();
  return targetMap.updateRouting(client->getRoutingInfo());
}
// Shuts all components down: workers first, then the mgmtd connection (after
// reporting every target as offline), then the coroutine pools, and finally
// the storage targets and RDMA buffers. Always returns Void.
//
// @param executor executors used to release targets and buffers in parallel.
Result<Void> Components::stopAndJoin(CPUExecutorGroup &executor) {
LOG_COMMAND(INFO, "Stop aioReadWorker", aioReadWorker.stopAndJoin());
LOG_COMMAND(INFO, "Stop syncMetaKvWorker", syncMetaKvWorker.stopAndJoin());
LOG_COMMAND(INFO, "Stop punchHoleWorker", punchHoleWorker.stopAndJoin());
LOG_COMMAND(INFO, "Stop allocateWorker", allocateWorker.stopAndJoin());
LOG_COMMAND(INFO, "Stop dumpWorker", dumpWorker.stopAndJoin());
LOG_COMMAND(INFO, "Stop checkWorker", checkWorker.stopAndJoin());
LOG_COMMAND(INFO, "Stop resyncWorker", resyncWorker.stopAndJoin());
LOG_COMMAND(INFO, "Stop storageOperator", storageOperator.stopAndJoin());
LOG_COMMAND(INFO, "Stop reliableForwarding", reliableForwarding.stopAndJoin());
LOG_COMMAND(INFO, "Stop messenger", messenger.stopAndJoin());
// Replace the update callback with a no-op so later target map changes no
// longer rebuild the heartbeat payload.
targetMap.setUpdateCallback([](auto) {});
XLOGF(INFO, "Send offline state");
if (auto mgmtd = mgmtdClient.load()) {
mgmtd->removeRoutingInfoListener(kRoutingInfoListenerName);
// Report every local target as OFFLINE in one last heartbeat.
updateHeartbeatPayload(*targetMap.snapshot(), true);
folly::coro::blockingWait(mgmtd->heartbeat());
}
LOG_COMMAND(INFO, "Stop routingStore", stopMgmtdClient());
LOG_COMMAND(INFO, "Stop readPool", readPool.stopAndJoin());
LOG_COMMAND(INFO, "Stop updatePool", updatePool.stopAndJoin());
LOG_COMMAND(INFO, "Stop syncPool", syncPool.stopAndJoin());
LOG_COMMAND(INFO, "Stop defaultPool", defaultPool.stopAndJoin());
// Collect the storage targets from the released map so they can be released
// in parallel after the map itself is dropped.
auto snapshot = targetMap.release();
std::vector<std::shared_ptr<StorageTarget>> targets;
for (auto &[targetId, target] : snapshot->getTargets()) {
if (target.storageTarget != nullptr) {
targets.push_back(target.storageTarget);
}
}
LOG_COMMAND(INFO, "Reset target map", snapshot.reset());
XLOGF(WARNING, "start to release {} targets", targets.size());
std::atomic<uint32_t> released{};
std::atomic<uint32_t> synced{};
// The tasks capture the two counters by reference; that is safe because this
// function does not return until `released` reaches targets.size().
for (auto &target : targets) {
executor.randomPick().add([&, t = std::move(target)]() mutable {
auto result = t->release();
if (UNLIKELY(!result)) {
XLOGF(CRITICAL, "storage target sync meta failed {}, error: {}", t->path(), result.error());
} else {
++synced;
}
t = nullptr;
++released;
});
}
// Poll every 100 ms, logging on every fifth iteration.
for (int i = 0; released != targets.size(); ++i) {
XLOGF_IF(INFO, i % 5 == 0, "Waiting for release targets finished...");
std::this_thread::sleep_for(100_ms);
}
XLOGF(WARNING, "released {} targets, synced {} targets", released.load(), synced.load());
LOG_COMMAND(INFO, "Clear storageTargets", storageTargets.globalFileStore().clear(executor));
LOG_COMMAND(INFO, "Clear rdmabufPool", rdmabufPool.clear(executor));
if (config.speed_up_quit()) {
for (auto &engine : storageTargets.engines()) {
engine->speed_up_quit();
}
}
return Void{};
}
// Stops and drops the mgmtd client, then stops and drops the net client.
// Either step is skipped when the corresponding client does not exist.
Result<Void> Components::stopMgmtdClient() {
  if (auto client = mgmtdClient.load(); client != nullptr) {
    folly::coro::blockingWait(client->stop());
  }
  mgmtdClient.store(nullptr);

  if (netClient != nullptr) {
    netClient->stopAndJoin();
  }
  netClient.reset();

  return Void{};
}
// Asks mgmtd for the current client sessions and returns the set of their
// client ids. Fails with the mgmtd error if the request fails, or with
// kRoutingError while mgmtd reports that it is bootstrapping.
Result<robin_hood::unordered_set<std::string>> Components::getActiveClientsList() {
  using ClientSet = robin_hood::unordered_set<std::string>;

  auto result = folly::coro::blockingWait(mgmtdClient.load()->listClientSessions());
  RETURN_AND_LOG_ON_ERROR(result);

  if (result->bootstrapping) {
    auto msg = fmt::format("mgmtd is bootstrapping, skip");
    XLOG(WARNING, msg);
    return makeError(StorageClientCode::kRoutingError, std::move(msg));
  }

  ClientSet activeClients;
  auto &sessions = result->sessions;
  for (auto &session : sessions) {
    // The session list is discarded afterwards, so the ids can be moved out.
    activeClients.emplace(std::move(session.clientId));
  }
  return Result<ClientSet>(std::move(activeClients));
}
// Triggers a heartbeat on the mgmtd client if the heartbeat payload has been
// updated since the last call. The pending counter is reset atomically.
void Components::triggerHeartbeatIfNeed() {
  const auto pendingUpdates = triggerHeartbeatFlag.exchange(0);
  if (pendingUpdates == 0) {
    return;
  }
  mgmtdClient.load()->triggerHeartbeat();
}
// Rebuilds the heartbeat payload from `targetMap` and hands it to the mgmtd
// client, then marks a heartbeat as pending.
//
// When `offline` is true every target is reported as OFFLINE. The state
// recorder always receives the target's actual local state. Used size and
// chain version are only filled in for targets not reported as OFFLINE.
void Components::updateHeartbeatPayload(const TargetMap &targetMap, bool offline /* = false */) {
  flat::StorageHeartbeatInfo heartbeat;
  for (auto &[targetId, target] : targetMap.getTargets()) {
    monitor::TagSet tag;
    tag.addTag("instance", fmt::format("{}", targetId));
    targetStateRecorder.set(uint32_t(target.localState), tag);

    flat::LocalTargetInfo targetInfo;
    targetInfo.targetId = targetId;
    if (offline) {
      targetInfo.localState = flat::LocalTargetState::OFFLINE;
    } else {
      targetInfo.localState = target.localState;
    }
    targetInfo.diskIndex = target.diskIndex;
    targetInfo.lowSpace = target.lowSpace;

    const bool reportedOnline = targetInfo.localState != flat::LocalTargetState::OFFLINE;
    if (reportedOnline) {
      targetInfo.usedSize = target.storageTarget->usedSize();
      targetInfo.chainVersion = target.vChainId.chainVer;
    }
    heartbeat.targets.push_back(targetInfo);
  }

  mgmtdClient.load()->updateHeartbeatPayload(heartbeat);
  ++triggerHeartbeatFlag;
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,124 @@
#pragma once
#include <folly/concurrency/AtomicSharedPtr.h>
#include "client/mgmtd/MgmtdClientForServer.h"
#include "client/storage/StorageMessenger.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/DynamicCoroutinesPool.h"
#include "common/utils/LockManager.h"
#include "common/utils/RobinHood.h"
#include "fbs/storage/Service.h"
#include "storage/aio/AioReadWorker.h"
#include "storage/service/BufferPool.h"
#include "storage/service/StorageOperator.h"
#include "storage/service/TargetMap.h"
#include "storage/store/StorageTargets.h"
#include "storage/sync/ResyncWorker.h"
#include "storage/worker/AllocateWorker.h"
#include "storage/worker/CheckWorker.h"
#include "storage/worker/DumpWorker.h"
#include "storage/worker/PunchHoleWorker.h"
#include "storage/worker/SyncMetaKvWorker.h"
namespace hf3fs::storage {
class ReliableForwarding;
// Aggregates every long-lived component of the storage service together with
// the configuration that drives them. Members are public and are constructed
// in declaration order.
struct Components {
// Configuration tree; one CONFIG_OBJ section per component.
struct Config : public ConfigBase<Config> {
// Server defaults: group 0 is RDMA on port 8000 serving "StorageSerde";
// group 1 is TCP on port 9000 serving "Core" with its own thread pool.
CONFIG_OBJ(base, net::Server::Config, [](net::Server::Config &c) {
c.set_groups_length(2);
c.groups(0).listener().set_listen_port(8000);
c.groups(0).set_network_type(net::Address::RDMA);
c.groups(0).set_services({"StorageSerde"});
c.groups(1).set_network_type(net::Address::TCP);
c.groups(1).listener().set_listen_port(9000);
c.groups(1).set_use_independent_thread_pool(true);
c.groups(1).set_services({"Core"});
c.thread_pool().set_num_io_threads(32);
c.thread_pool().set_num_proc_threads(32);
});
CONFIG_OBJ(client, net::Client::Config);
CONFIG_OBJ(mgmtd, hf3fs::client::MgmtdClientForServer::Config);
CONFIG_OBJ(targets, StorageTargets::Config);
CONFIG_OBJ(storage, StorageOperator::Config);
CONFIG_OBJ(reliable_forwarding, ReliableForwarding::Config);
CONFIG_OBJ(reliable_update, ReliableUpdate::Config);
CONFIG_OBJ(buffer_pool, BufferPool::Config);
CONFIG_OBJ(aio_read_worker, AioReadWorker::Config);
CONFIG_OBJ(sync_worker, ResyncWorker::Config);
CONFIG_OBJ(check_worker, CheckWorker::Config);
CONFIG_OBJ(dump_worker, DumpWorker::Config);
CONFIG_OBJ(allocate_worker, AllocateWorker::Config);
CONFIG_OBJ(sync_meta_kv_worker, SyncMetaKvWorker::Config);
CONFIG_OBJ(forward_client, net::Client::Config);
// One section per coroutine pool.
CONFIG_OBJ(coroutines_pool_read, DynamicCoroutinesPool::Config);
CONFIG_OBJ(coroutines_pool_update, DynamicCoroutinesPool::Config);
CONFIG_OBJ(coroutines_pool_sync, DynamicCoroutinesPool::Config);
CONFIG_OBJ(coroutines_pool_default, DynamicCoroutinesPool::Config);
// When disabled, getCoroutinesPool() no longer routes to the dedicated read /
// update pool.
CONFIG_HOT_UPDATED_ITEM(use_coroutines_pool_read, true);
CONFIG_HOT_UPDATED_ITEM(use_coroutines_pool_update, true);
// When set, stopAndJoin() calls speed_up_quit() on every storage engine.
CONFIG_HOT_UPDATED_ITEM(speed_up_quit, true);
};
// Constructs all components from `config`; nothing is started yet.
Components(const Config &config);
// Starts all components; returns the first failure.
Result<Void> start(const flat::AppInfo &appInfo, net::ThreadPoolGroup &tpg);
// Connects to mgmtd and waits until local targets are offline in routing info.
Result<Void> waitRoutingInfo(const flat::AppInfo &appInfo, folly::CPUThreadPoolExecutor &executor);
// Applies the mgmtd client's current routing info to targetMap.
Result<Void> refreshRoutingInfo();
// Stops all components and releases targets and buffers using `executor`.
Result<Void> stopAndJoin(CPUExecutorGroup &executor);
// Stops and drops the mgmtd client and the net client.
Result<Void> stopMgmtdClient();
const flat::AppInfo &getAppInfo() const { return appInfo; }
// Returns the ids of the client sessions currently known to mgmtd.
Result<robin_hood::unordered_set<std::string>> getActiveClientsList();
// Triggers a heartbeat if the payload changed since the last call.
void triggerHeartbeatIfNeed();
// Selects the coroutine pool for an RPC method: batch reads go to readPool,
// writes/updates to updatePool (each only while the matching config switch is
// on), sync-start and chunk-metadata listing to syncPool, all else to
// defaultPool.
inline DynamicCoroutinesPool &getCoroutinesPool(uint16_t methodId) {
if (LIKELY(config.use_coroutines_pool_read()) && methodId == StorageSerde<>::batchReadMethodId) {
return readPool;
}
if (LIKELY(config.use_coroutines_pool_update()) &&
(methodId == StorageSerde<>::writeMethodId || methodId == StorageSerde<>::updateMethodId)) {
return updatePool;
}
if (methodId == StorageSerde<>::syncStartMethodId || methodId == StorageSerde<>::getAllChunkMetadataMethodId) {
return syncPool;
}
return defaultPool;
}
protected:
// Rebuilds the heartbeat payload from `map`; reports all targets as OFFLINE
// when `offline` is true.
void updateHeartbeatPayload(const TargetMap &map, bool offline = false);
public:
ConstructLog<"storage::Components"> constructLog_;
const Config &config;
flat::AppInfo appInfo;
// Created in waitRoutingInfo(), destroyed in stopMgmtdClient().
std::unique_ptr<net::Client> netClient;
folly::atomic_shared_ptr<hf3fs::client::IMgmtdClientForServer> mgmtdClient;
BufferPool rdmabufPool;
AtomicallyTargetMap targetMap;
StorageTargets storageTargets;
AioReadWorker aioReadWorker;
client::StorageMessenger messenger;
ResyncWorker resyncWorker;
CheckWorker checkWorker;
DumpWorker dumpWorker;
AllocateWorker allocateWorker;
PunchHoleWorker punchHoleWorker;
SyncMetaKvWorker syncMetaKvWorker;
ReliableForwarding reliableForwarding;
DynamicCoroutinesPool readPool;
DynamicCoroutinesPool updatePool;
DynamicCoroutinesPool syncPool;
DynamicCoroutinesPool defaultPool;
StorageOperator storageOperator;
ReliableUpdate reliableUpdate;
// Incremented on every heartbeat payload update; reset to zero by
// triggerHeartbeatIfNeed().
std::atomic<uint32_t> triggerHeartbeatFlag{};
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,281 @@
#include "storage/service/ReliableForwarding.h"
#include <folly/experimental/coro/Sleep.h>
#include "common/app/ApplicationBase.h"
#include "common/monitor/Recorder.h"
#include "common/utils/Duration.h"
#include "common/utils/ExponentialBackoffRetry.h"
#include "fbs/storage/Common.h"
#include "storage/service/Components.h"
#include "storage/service/TargetMap.h"
namespace hf3fs::storage {
// Metrics for the forwarding path.
namespace {
// Covers one forwardWithRetry() call, including all retries.
monitor::OperationRecorder reliableForwardRecorder("storage.reliable_forward");
// Covers the full-chunk read done before forwarding to a syncing successor.
monitor::OperationRecorder syncingReadRecorder("storage.syncing_read");
// Covers the remote update request sent to the successor.
monitor::OperationRecorder updateRemoteRecorder("storage.update_remote");
monitor::CountRecorder forwardWriteBytes("storage.forward.write_bytes");
monitor::DistributionRecorder forwardWriteDist("storage.forward.write_dist");
monitor::CountRecorder forwardSyncingBytes("storage.forward.syncing_bytes");
monitor::DistributionRecorder forwardSyncingDist("storage.forward.syncing_dist");
} // namespace
using namespace std::chrono_literals;
// Nothing to initialize; always succeeds.
Result<Void> ReliableForwarding::init() {
  return Void{};
}
// Nothing to stop or join here; always succeeds.
Result<Void> ReliableForwarding::stopAndJoin() {
  return Void{};
}
// Forwards an update to the successor of the local target, retrying with
// exponential backoff until it succeeds, the retry budget is used up, or the
// service is stopping.
//
// @param requestCtx            request context; only its fault-injection flag is read.
// @param req                   the update to forward.
// @param rdmabuf               remote buffer description passed through to forward().
// @param chunkEngineJob        chunk engine job passed through to forward().
// @param target                in/out: re-resolved from the target map on every attempt.
// @param commitIO              in/out: commit versions filled in by forward().
// @param allowOutdatedChainVer passed to the target lookup; when false, a routing
//                              version mismatch is returned without retrying.
// @return the result of the last attempt. "No successor" counts as success for
//         the recorder. kRequestRefused is returned when the loop exits because
//         the service is stopping.
CoTask<IOResult> ReliableForwarding::forwardWithRetry(ServiceRequestContext &requestCtx,
                                                      const UpdateReq &req,
                                                      const net::RDMARemoteBuf &rdmabuf,
                                                      const ChunkEngineUpdateJob &chunkEngineJob,
                                                      TargetPtr &target,
                                                      CommitIO &commitIO,
                                                      bool allowOutdatedChainVer /* = true */) {
  auto startTime = RelativeTime::now();
  auto recordGuard = reliableForwardRecorder.record();
  ExponentialBackoffRetry retry(config_.retry_first_wait().asMs(),
                                config_.retry_max_wait().asMs(),
                                config_.retry_total_time().asMs());

  for (uint32_t retryCount = 0; !stopped_; ++retryCount) {
    auto waitTime = retry.getWaitTime();

    // Resolve the target again on every attempt so routing changes are seen.
    auto targetResult = components_.targetMap.getByChainId(req.payload.key.vChainId, allowOutdatedChainVer);
    CO_RETURN_ON_ERROR(targetResult);
    target = std::move(*targetResult);

    auto ioResult = co_await forward(req, retryCount, rdmabuf, chunkEngineJob, target, commitIO, waitTime);
    if (LIKELY(bool(ioResult.lengthInfo))) {
      recordGuard.succ();
      co_return ioResult;
    } else if (ioResult.lengthInfo.error().code() == StorageCode::kNoSuccessorTarget) {
      // Nothing to forward to: not a failure of the forwarding itself.
      recordGuard.succ();
      co_return ioResult;
    }

    // TODO(SF): fine-grained error handling.
    auto code = ioResult.lengthInfo.error().code();
    if (!allowOutdatedChainVer && code == StorageClientCode::kRoutingVersionMismatch) {
      XLOGF(ERR,
            "forwarding routing version mismatch, req {}, result {}, elapsed {}",
            req,
            ioResult,
            (RelativeTime::now() - startTime).asMs());
      co_return ioResult;
    }

    if (waitTime.count() == 0) {
      // A zero wait time is treated as "retry budget exhausted".
      XLOGF_IF(DFATAL,
               !requestCtx.debugFlags.faultInjectionEnabled(),
               "forwarding timeout with error, req {}, result {}",
               req,
               ioResult);
      co_return ioResult;
    } else if (code != RPCCode::kTimeout) {
      // After an RPC timeout the loop retries immediately; for any other error
      // it waits up to waitTime before the next attempt.
      XLOGF(WARNING,
            "forwarding wait and retry, req {}, error {}, elapsed {}",
            req,
            ioResult,
            (RelativeTime::now() - startTime).asMs());
      constexpr auto checkInterval = 100ms;
      for (auto elapsed = 0ms; elapsed < waitTime && !stopped_; elapsed += checkInterval) {
        auto targetResult = components_.targetMap.getByChainId(req.payload.key.vChainId, allowOutdatedChainVer);
        CO_RETURN_ON_ERROR(targetResult);
        target = std::move(*targetResult);
        // Stop waiting early once the target has no successor any more.
        if (!target->successor.has_value()) {
          break;
        }
        co_await folly::coro::sleep(std::min(checkInterval, waitTime - elapsed));
      }
    }
  }

  auto msg = fmt::format("req is refused because of stopping, req {}", req);
  XLOG(ERR, msg);
  co_return makeError(RPCCode::kRequestRefused, std::move(msg));
}
// Performs one forwarding attempt and updates `commitIO` accordingly.
//
// Without a successor, the local chain version is recorded in `commitIO` and
// kNoSuccessorTarget is returned. After a successful forward, the successor's
// commit version and chain version are recorded; if the successor reports a
// newer chain version than the local target, kChainVersionMismatch is returned
// instead of the result.
CoTask<IOResult> ReliableForwarding::forward(const UpdateReq &req,
                                             uint32_t retryCount,
                                             const net::RDMARemoteBuf &rdmabuf,
                                             const ChunkEngineUpdateJob &chunkEngineJob,
                                             TargetPtr &target,
                                             CommitIO &commitIO,
                                             std::chrono::milliseconds timeout) {
  const bool hasSuccessor = target->successor.has_value();
  if (!hasSuccessor) {
    // use the latest chain version.
    commitIO.commitChainVer = target->vChainId.chainVer;
    co_return makeError(StorageCode::kNoSuccessorTarget);
  }

  auto ioResult = co_await doForward(req, rdmabuf, chunkEngineJob, retryCount, *target, commitIO.isSyncing, timeout);
  if (!ioResult.lengthInfo) {
    co_return ioResult;
  }

  commitIO.commitVer = ioResult.commitVer;
  // use successor's chain version.
  commitIO.commitChainVer = ioResult.commitChainVer;
  if (!(ioResult.commitChainVer > target->vChainId.chainVer)) {
    co_return ioResult;
  }

  // the remote obtains a higher chain version, and the local need to obtain the latest version by retry.
  auto msg = fmt::format("the remote obtains a higher chain version {} > current {}, req {}",
                         ioResult.commitChainVer,
                         target->vChainId.chainVer,
                         req);
  XLOGF(WARNING, "{}", msg);
  co_return makeError(StorageCode::kChainVersionMismatch, std::move(msg));
}
// Sends one update to the successor of `target` and returns the successor's IOResult.
//
// A copy of `req` is rewritten before it is sent: it is flagged as not coming from a client and carries
// `retryCount`, `rdmabuf` and the chain version of `target`.
//
// @param req            the update as received by this node (never modified).
// @param rdmabuf        remote buffer placed into the forwarded payload (replaced when a full chunk is read).
// @param chunkEngineJob if it holds a chunk, that chunk is handed to the local read job.
// @param retryCount     copied into the forwarded request.
// @param target         local target; its `successor` is the destination.
// @param isSyncing      output: set to whether the successor's public state is SYNCING.
// @param timeout        timeout given to the messenger for the forwarded RPC.
// @return the successor's IOResult, or an error for: buffer allocation failure, local read failure, chunk
//         query failure, missing successor address, RPC failure, or kChainVersionMismatch when the successor
//         reports a commit chain version greater than the local chain version.
//
// NOTE(review): `target.successor` is dereferenced below before getSuccessorAddr() checks that it has a
// value; presumably the caller guarantees a successor exists - confirm.
CoTask<IOResult> ReliableForwarding::doForward(const UpdateReq &req,
const net::RDMARemoteBuf &rdmabuf,
const ChunkEngineUpdateJob &chunkEngineJob,
uint32_t retryCount,
const Target &target,
bool &isSyncing,
std::chrono::milliseconds timeout) {
// Build the request that will actually be sent, starting from a copy of the incoming one.
UpdateReq updateReq = req;
updateReq.options.fromClient = false;
updateReq.retryCount = retryCount;
updateReq.payload.rdmabuf = rdmabuf;
updateReq.payload.key.vChainId.chainVer = target.vChainId.chainVer;
// Buffer handle from the pool; it is only allocated from in the full-chunk read branch below.
auto buffer = components_.rdmabufPool.get();
isSyncing = target.successor->targetInfo.publicState == hf3fs::flat::PublicTargetState::SYNCING;
if (isSyncing) {
updateReq.options.isSyncing = true;
updateReq.options.commitChainVer = target.vChainId.chainVer;
}
// A syncing successor gets the whole chunk instead of the original update when the update is a
// write/truncate/extend and either the incoming request is itself a syncing request or its length differs
// from the chunk size.
bool readForSyncing = req.payload.isWriteTruncateExtend() && isSyncing &&
(req.options.isSyncing || req.payload.length != req.payload.chunkSize);
if (readForSyncing) {
auto recordGuard = syncingReadRecorder.record();
// read the entire chunk.
IOResult readResult;
// Try the synchronous allocation first and fall back to the awaitable one if it fails.
auto allocateResult = buffer.tryAllocate(req.payload.chunkSize);
if (UNLIKELY(!allocateResult)) {
allocateResult = co_await buffer.allocate(req.payload.chunkSize);
}
if (UNLIKELY(!allocateResult)) {
readResult.lengthInfo = makeError(std::move(allocateResult.error()));
co_return readResult;
}
auto &readBuf = *allocateResult;
// Read [0, chunkSize) of the same chunk key from the local storage target.
ReadIO payload;
payload.key = updateReq.payload.key;
payload.offset = 0;
payload.length = req.payload.chunkSize;
BatchReadJob batch(payload, target.storageTarget.get(), readResult, req.payload.checksum.type);
batch.setRecalculateChecksum();
batch.front().state().localbuf = readBuf;
batch.front().state().bufferIndex = buffer.index();
batch.front().state().readUncommitted = true;
// Reuse the chunk already held by the chunk engine job, if any.
if (chunkEngineJob.chunk()) {
batch.front().state().chunkEngineJob.set(nullptr, chunkEngineJob.chunk()->raw_chunk());
}
co_await components_.aioReadWorker.enqueue(&batch);
co_await batch.complete();
CO_RETURN_ON_ERROR(readResult.lengthInfo); // OK.
// clear the inline data if the update is built from full chunk read
if (BITFLAGS_CONTAIN(updateReq.featureFlags, FeatureFlags::SEND_DATA_INLINE)) {
BITFLAGS_CLEAR(updateReq.featureFlags, FeatureFlags::SEND_DATA_INLINE);
updateReq.payload.inlinebuf.data.clear();
}
// Turn the forwarded request into a WRITE of the bytes that were just read.
auto length = *readResult.lengthInfo;
updateReq.payload.updateVer = readResult.updateVer;
// For an incoming syncing request the commit chain version comes from the read result instead of the
// local chain version assigned above.
if (req.options.isSyncing) {
updateReq.options.commitChainVer = batch.front().result().commitChainVer;
}
updateReq.payload.offset = 0;
updateReq.payload.length = length;
updateReq.payload.rdmabuf = readBuf.first(length).toRemoteBuf();
updateReq.payload.checksum = batch.front().state().chunkChecksum;
updateReq.payload.updateType = UpdateType::WRITE;
// Payloads no larger than the configured limit are additionally attached inline.
if (length <= config_.max_inline_forward_bytes()) {
updateReq.payload.inlinebuf.data.assign(readBuf.ptr(), readBuf.ptr() + length);
BITFLAGS_SET(updateReq.featureFlags, hf3fs::storage::FeatureFlags::SEND_DATA_INLINE);
}
recordGuard.succ();
} else if (isSyncing && !req.payload.isRemove() && chunkEngineJob.chunk() == nullptr) {
// Syncing successor, not a remove, and no chunk held by the chunk engine job: take the update version
// from the local chunk metadata.
auto chunkResult = target.storageTarget->queryChunk(req.payload.key.chunkId);
if (UNLIKELY(!chunkResult)) {
XLOGF(ERR, "forward query chunk failed, req {}, error {}", updateReq, chunkResult.error());
co_return makeError(std::move(chunkResult.error()));
}
updateReq.payload.updateVer = chunkResult->updateVer;
}
// Send the request to the successor.
auto recordGuard = updateRemoteRecorder.record();
auto addrResult = target.getSuccessorAddr();
if (UNLIKELY(!addrResult)) {
XLOGF(ERR, "target forward addr invalid, target {}", target);
co_return makeError(std::move(addrResult.error()));
}
net::UserRequestOptions reqOptions;
reqOptions.timeout = Duration{timeout};
auto updateResult = co_await components_.messenger.update(*addrResult, updateReq, &reqOptions);
// NOTE(review): this branch is taken for any RPC-level error, although the log text says "timeout".
if (UNLIKELY(!updateResult)) {
XLOGF(ERR, "forward timeout, req {}, result {}", updateReq, updateResult);
co_return makeError(std::move(updateResult.error()));
}
if (LIKELY(bool(updateResult->result.lengthInfo))) {
// The successor must not have committed with a chain version newer than the local one.
if (target.vChainId.chainVer < updateResult->result.commitChainVer) {
auto msg = fmt::format("chain version local < remote, req {} local {} remote {}",
updateReq,
target,
updateResult->result);
XLOG(ERR, msg);
co_return makeError(StorageCode::kChainVersionMismatch, std::move(msg));
}
auto length = *updateResult->result.lengthInfo;
monitor::TagSet tag;
tag.addTag("instance", fmt::format("{}", target.targetId));
if (isSyncing) {
// When syncing, the update version reported to the caller is the one from the original request.
updateResult->result.updateVer = req.payload.updateVer;
forwardSyncingBytes.addSample(length, tag);
forwardSyncingDist.addSample(length, tag);
} else {
forwardWriteBytes.addSample(length, tag);
forwardWriteDist.addSample(length, tag);
}
recordGuard.succ();
} else {
XLOGF(ERR, "forward failed, req {}, result {}", updateReq, updateResult->result);
auto errorCode = updateResult->result.lengthInfo.error().code();
// On a checksum mismatch, recompute the checksum over the local buffer that was sent. If it no longer
// matches the checksum in the request, log at DFATAL and raise SIGUSR2 through ApplicationBase.
if (errorCode == StorageCode::kChecksumMismatch) {
auto reqChecksum = updateReq.payload.checksum;
auto realChecksum = ChecksumInfo::create(reqChecksum.type,
(const uint8_t *)updateReq.payload.rdmabuf.addr(),
updateReq.payload.length);
if (reqChecksum != realChecksum) {
XLOGF(DFATAL,
"local rdma buffer is corrupted local {} != client {}, req: {}, kill self...",
realChecksum,
reqChecksum,
req);
ApplicationBase::handleSignal(SIGUSR2);
}
}
}
// Both the success and the failure result of the successor are returned as-is to the caller.
co_return updateResult->result;
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,65 @@
#pragma once
#include "client/storage/StorageMessenger.h"
#include "common/net/Client.h"
#include "common/net/ib/RDMABuf.h"
#include "common/utils/ConfigBase.h"
#include "fbs/storage/Common.h"
#include "storage/update/UpdateJob.h"
namespace hf3fs::storage {
struct Components;
struct Target;
// Forwards update requests from a local target to its successor target.
class ReliableForwarding {
public:
// Hot-updatable settings.
struct Config : ConfigBase<Config> {
// Retry timing knobs; presumably consumed by forwardWithRetry(), whose body is not visible here - confirm.
CONFIG_HOT_UPDATED_ITEM(retry_first_wait, 100_ms);
CONFIG_HOT_UPDATED_ITEM(retry_max_wait, 1000_ms);
CONFIG_HOT_UPDATED_ITEM(retry_total_time, 60_s);
// doForward() attaches a full-chunk payload inline when its length is <= this value.
CONFIG_HOT_UPDATED_ITEM(max_inline_forward_bytes, Size{});
};
// Stores references only; `config` and `components` must outlive this object.
ReliableForwarding(const Config &config, Components &components)
: config_(config),
components_(components) {}
Result<Void> init();
// Sets the stopped flag.
void beforeStop() { stopped_ = true; }
Result<Void> stopAndJoin();
// Presumably retries forward() according to the retry_* settings - confirm against the definition.
CoTask<IOResult> forwardWithRetry(ServiceRequestContext &requestCtx,
const UpdateReq &req,
const net::RDMARemoteBuf &rdmabuf,
const ChunkEngineUpdateJob &chunkEngineJob,
TargetPtr &target,
CommitIO &commitIO,
bool allowOutdatedChainVer = true);
// Single forwarding attempt with an explicit retry count and timeout.
CoTask<IOResult> forward(const UpdateReq &req,
uint32_t retryCount,
const net::RDMARemoteBuf &rdmabuf,
const ChunkEngineUpdateJob &chunkEngineJob,
TargetPtr &target,
CommitIO &commitIO,
std::chrono::milliseconds timeout);
// Sends the update to the successor of `target`; `isSyncing` is an output parameter.
CoTask<IOResult> doForward(const UpdateReq &req,
const net::RDMARemoteBuf &rdmabuf,
const ChunkEngineUpdateJob &chunkEngineJob,
uint32_t retryCount,
const Target &target,
bool &isSyncing,
std::chrono::milliseconds timeout);
private:
ConstructLog<"storage::ReliableForwarding"> constructLog_;
const Config &config_;
Components &components_;
// Set by beforeStop().
std::atomic<bool> stopped_ = false;
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,158 @@
#include "storage/service/ReliableUpdate.h"
#include "common/monitor/Recorder.h"
#include "common/utils/Duration.h"
#include "storage/service/Components.h"
#include "storage/service/StorageOperator.h"
namespace hf3fs::storage {
// Monitoring recorders used by ReliableUpdate::update(); the string argument is the metric name.
// Wraps the call to StorageOperator::handleUpdate.
monitor::OperationRecorder reliableUpdateRecorder{"storage.reliable_update"};
// Incremented when the channel lock cannot be acquired.
monitor::CountRecorder reliableUpdateWaited{"storage.reliable_update.waited"};
// Incremented when a request carries a sequence number lower than the cached one.
// NOTE(review): the identifier is misspelled ("Duplidate"); renaming it means touching its use in update().
monitor::CountRecorder reliableUpdateDuplidate{"storage.reliable_update.duplicate"};
// Incremented when a cached result is returned.
monitor::CountRecorder reliableUpdateCached{"storage.reliable_update.cached"};
// Wraps the attempt to lock the channel.
monitor::OperationRecorder waitChannelLockRecorder{"storage.wait_channel_lock"};
// Executes an update with per-channel de-duplication of retried requests.
//
// The last result is cached per (client id, chain id, channel id). After locking the channel on the storage
// target the request is handled as follows:
//  - seqnum lower than the cached one: rejected with kDuplicateUpdate;
//  - same seqnum and same target generation id: the cached successful result is returned if the update
//    versions are compatible, otherwise the update is executed again;
//  - anything else: the update is executed through StorageOperator::handleUpdate and the outcome is cached.
//
// `req` is a non-const reference because req.payload.updateVer may be overwritten with a previously recorded
// update version before the update is executed.
//
// Errors returned directly: kRequestRefused (stopping), kFoundBug (channel id 0 or tag reused by another
// request id), kChannelIsLocked, kDuplicateUpdate.
CoTask<IOResult> ReliableUpdate::update(ServiceRequestContext &requestCtx,
UpdateReq &req,
net::IBSocket *ibSocket,
TargetPtr &target) {
XLOGF(DBG1, "Start reliable update, tag: {}, req: {}", req.tag, req);
// Refuse new work once beforeStop() has been called.
if (UNLIKELY(stopped_)) {
auto msg = fmt::format("req is refused because of stopping, req {}", req);
XLOG(ERR, msg);
co_return makeError(RPCCode::kRequestRefused, std::move(msg));
}
// 1. check if channel id is valid.
if (req.tag.channel.id == ChannelId{0}) {
XLOGF(DFATAL,
"{} request has invalid message tag {}: {}",
magic_enum::enum_name(req.payload.updateType),
req.tag,
req);
co_return makeError(StorageClientCode::kFoundBug);
}
// 2. get cached.
auto clientId = req.tag.clientId;
auto reqResult = shards_.withLock(
[&](ClientMap &map) {
// Create the per-client status on first use.
auto &clientStatus = map[clientId];
if (clientStatus == nullptr) {
clientStatus = std::make_shared<ClientStatus>();
}
auto key = std::pair<ChainId, ChannelId>(req.payload.key.vChainId.chainId, req.tag.channel.id);
auto &reqResult = clientStatus->channelMap[key];
clientStatus->lastUsedTime = UtcClock::now();
// Aliasing shared_ptr: shares ownership of the ClientStatus while pointing at the channel's entry.
return std::shared_ptr<ReqResult>(clientStatus, &reqResult);
},
clientId);
// 3. lock channel.
auto lockRecordGuard = waitChannelLockRecorder.record();
// The baton is handed to tryLockChannel; this function does not wait on it. A failed lock is reported to
// the caller as kChannelIsLocked.
folly::coro::Baton baton;
auto lock = target->storageTarget->tryLockChannel(baton, fmt::format("{}:{}", clientId, req.tag.channel.id));
if (!lock.locked()) {
reliableUpdateWaited.addSample(1);
XLOGF(ERR, "Channel is locked, need retry, tag: {}, req: {}", req.tag, req);
co_return makeError(StorageCode::kChannelIsLocked);
}
lockRecordGuard.report(true);
IOResult updateResult;
// A sequence number lower than the cached one is rejected as a duplicate.
if (req.tag.channel.seqnum < reqResult->channelSeqnum) {
reliableUpdateDuplidate.addSample(1);
XLOGF(WARN, "Find a duplicate update, tag: {}, cached result: {}, req: {}", req.tag, *reqResult, req);
co_return makeError(StorageClientCode::kDuplicateUpdate);
}
// 4. return cached result.
// Only considered when the cached entry has the same seqnum and was recorded with the target's current
// generation id.
if (req.tag.channel.seqnum == reqResult->channelSeqnum &&
target->storageTarget->generationId() == reqResult->generationId) {
if (req.tag.requestId != reqResult->requestId) {
XLOGF(DFATAL,
"[BUG] Message tag {} is already assigned to another update, cached result: {}, req: {}",
req.tag,
*reqResult,
req);
co_return makeError(StorageClientCode::kFoundBug);
}
if (reqResult->updateResult.lengthInfo.hasValue()) {
// The cached update succeeded: reuse it if the request has no update version or the same one.
if (req.payload.updateVer == 0 || req.payload.updateVer == reqResult->updateResult.updateVer) {
updateResult = reqResult->updateResult;
// Unless the request is an extend, the returned length is replaced by the request length when they differ.
if (*updateResult.lengthInfo != req.payload.length && !req.payload.isExtend()) {
updateResult.lengthInfo = req.payload.length;
XLOGF(WARN,
"Cached length info {} not equal to write size in request {}, fixed update result: {}",
reqResult->updateResult.lengthInfo,
req,
updateResult);
}
reliableUpdateCached.addSample(1);
XLOGF(DBG1, "Return cached update result, tag: {}, cached result: {}, req: {}", req.tag, *reqResult, req);
co_return updateResult;
} else {
// Version mismatch: only logged; execution falls through to step 5.
XLOGF(CRITICAL,
"Cached update version not equal to request update version, req:{}, cached result: {}",
req,
*reqResult);
}
} else if (req.payload.updateVer == 0 && !target->storageTarget->useChunkEngine() &&
reqResult->succUpdateVer != 0) {
// The cached update failed, the request has no update version, and the target does not use the chunk
// engine: reuse the update version recorded for the previous attempt.
XLOGF(CRITICAL, "Pick up previous update version, tag: {}, cached result: {}, req: {}", req.tag, *reqResult, req);
req.payload.updateVer = reqResult->succUpdateVer;
}
}
// 5. start a new task.
auto recordGuard = reliableUpdateRecorder.record();
updateResult = co_await components_.storageOperator.handleUpdate(requestCtx, req, ibSocket, target);
if (LIKELY(bool(updateResult.lengthInfo))) {
recordGuard.succ();
}
// Cache the outcome, successful or not. Initializer order: channelSeqnum, requestId, updateResult,
// succUpdateVer, generationId.
*reqResult = {req.tag.channel.seqnum,
req.tag.requestId,
updateResult,
req.payload.updateVer,
target->storageTarget->generationId()};
XLOGF(DBG1, "Completed reliable update, tag: {}, result: {}", req.tag, *reqResult);
co_return updateResult;
}
// Drops the cached state of clients that are no longer active.
//
// A client entry is erased when all of the following hold: its hex-encoded uuid is not in `activeClients`,
// its id is not the all-zero id, and it has not been used for at least `expired_clients_timeout`.
// Nothing is done when the feature is disabled in the config or when `activeClients` is empty.
// Always returns Void{}.
Result<Void> ReliableUpdate::cleanUpExpiredClients(const robin_hood::unordered_set<std::string> &activeClients) {
  if (!config_.clean_up_expired_clients()) {
    return Void{};
  }
  if (activeClients.empty()) {
    // An empty set is not treated as "every client expired".
    XLOGF(ERR, "activeClients is empty!");
    return Void{};
  }

  auto zeroId = ClientId::zero();
  std::size_t removed = 0;
  shards_.iterate([&](ClientMap &clients) {
    auto currentTime = UtcClock::now();
    auto timeout = config_.expired_clients_timeout();
    auto cursor = clients.begin();
    while (cursor != clients.end()) {
      const auto &[clientId, clientStatus] = *cursor;
      const bool expired = !activeClients.contains(clientId.uuid.toHexString()) && clientId != zeroId &&
                           currentTime >= clientStatus->lastUsedTime + timeout;
      if (!expired) {
        ++cursor;
        continue;
      }
      XLOGF(WARNING, "clean up expired client {}, last used time: {}", clientId, clientStatus->lastUsedTime);
      // erase() hands back the iterator following the removed element.
      cursor = clients.erase(cursor);
      ++removed;
    }
  });
  XLOGF(WARNING, "clean up {} expired clients", removed);
  return Void{};
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,61 @@
#pragma once
#include "common/net/Transport.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/Coroutine.h"
#include "common/utils/Duration.h"
#include "common/utils/LockManager.h"
#include "common/utils/RobinHood.h"
#include "common/utils/Shards.h"
#include "common/utils/Size.h"
#include "fbs/storage/Common.h"
#include "storage/service/TargetMap.h"
namespace hf3fs::storage {
struct Components;
class StorageOperator;
// Executes client updates while caching the last result per client channel, so that a retried request can be
// answered from the cache.
class ReliableUpdate {
public:
struct Config : ConfigBase<Config> {
// cleanUpExpiredClients() is a no-op unless this is true.
CONFIG_HOT_UPDATED_ITEM(clean_up_expired_clients, false);
// Minimum idle time before an inactive client's cached state may be erased.
CONFIG_HOT_UPDATED_ITEM(expired_clients_timeout, 1_h);
};
// Stores references only; `config` and `components` must outlive this object.
ReliableUpdate(const Config &config, Components &components)
: config_(config),
components_(components) {}
// Runs the update or returns a cached result; may overwrite req.payload.updateVer.
CoTask<IOResult> update(ServiceRequestContext &requestCtx,
UpdateReq &req,
net::IBSocket *ibSocket,
TargetPtr &target);
// Erases cached state of clients that are not in `activeClients` and have been idle long enough.
Result<Void> cleanUpExpiredClients(const robin_hood::unordered_set<std::string> &activeClients);
// After this call update() refuses requests with kRequestRefused.
void beforeStop() { stopped_ = true; }
private:
ConstructLog<"storage::ReliableUpdate"> constructLog_;
const Config &config_;
Components &components_;
std::atomic<bool> stopped_ = false;
// NOTE(review): not referenced by update() or cleanUpExpiredClients() as visible in this change - confirm it is needed.
folly::coro::Mutex mutex_;
// Cached outcome of the most recent update on one channel.
struct ReqResult {
// Sequence number of the recorded request.
SERDE_STRUCT_FIELD(channelSeqnum, ChannelSeqNum{0});
// Request id of the recorded request.
SERDE_STRUCT_FIELD(requestId, RequestId{0});
// Result returned by handleUpdate for the recorded request.
SERDE_STRUCT_FIELD(updateResult, IOResult{});
// req.payload.updateVer as it was after handleUpdate returned.
SERDE_STRUCT_FIELD(succUpdateVer, ChunkVer{});
// Generation id of the storage target when the result was recorded.
SERDE_STRUCT_FIELD(generationId, uint32_t{});
};
// Per-client cache: one ReqResult per (chain id, channel id) plus the time of last use.
struct ClientStatus {
std::unordered_map<std::pair<ChainId, ChannelId>, ReqResult> channelMap;
UtcTime lastUsedTime;
};
using ClientMap = std::unordered_map<ClientId, std::shared_ptr<ClientStatus>>;
// Client map wrapped in Shards; 1024 is presumably the shard count - confirm against Shards.h.
Shards<ClientMap, 1024> shards_;
};
} // namespace hf3fs::storage

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,161 @@
#pragma once
#include <folly/concurrency/ConcurrentHashMap.h>
#include <folly/executors/CPUThreadPoolExecutor.h>
#include "analytics/StructuredTraceLog.h"
#include "client/mgmtd/IMgmtdClientForServer.h"
#include "client/mgmtd/RoutingInfo.h"
#include "client/storage/StorageMessenger.h"
#include "common/net/Server.h"
#include "common/net/Transport.h"
#include "common/net/ib/IBSocket.h"
#include "common/net/ib/RDMABuf.h"
#include "common/utils/Address.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/Coroutine.h"
#include "common/utils/LockManager.h"
#include "common/utils/Semaphore.h"
#include "storage/aio/AioReadWorker.h"
#include "storage/service/BufferPool.h"
#include "storage/service/ReliableForwarding.h"
#include "storage/service/ReliableUpdate.h"
#include "storage/store/StorageTargets.h"
#include "storage/update/UpdateWorker.h"
namespace hf3fs::storage {
struct Components;
// Implements the storage RPCs; StorageService forwards each incoming request to the method of the same name.
class StorageOperator {
public:
class Config : public ConfigBase<Config> {
// Configuration of the UpdateWorker owned by this operator.
CONFIG_OBJ(write_worker, UpdateWorker::Config);
// Configuration of the structured storage event trace log.
CONFIG_OBJ(event_trace_log, analytics::StructuredTraceLog<StorageEventTrace>::Config);
CONFIG_HOT_UPDATED_ITEM(max_num_results_per_query, uint32_t{100});
CONFIG_HOT_UPDATED_ITEM(batch_read_job_split_size, uint32_t{1024});
CONFIG_HOT_UPDATED_ITEM(post_buffer_per_bytes, 64_KB);
CONFIG_HOT_UPDATED_ITEM(batch_read_ignore_chain_version, false);
// Token counts of the per-device RDMA write/read semaphores created in the constructor.
CONFIG_HOT_UPDATED_ITEM(max_concurrent_rdma_writes, 256U);
CONFIG_HOT_UPDATED_ITEM(max_concurrent_rdma_reads, 256U);
CONFIG_HOT_UPDATED_ITEM(read_only, false);
CONFIG_HOT_UPDATED_ITEM(rdma_transmission_req_timeout, 0_ms);
CONFIG_HOT_UPDATED_ITEM(apply_transmission_before_getting_semaphore, true);
};
// Creates one RDMA write semaphore and one RDMA read semaphore per IB device, and registers a config
// callback that re-applies the configured token counts whenever the config is updated.
// `config` and `components` are stored by reference and must outlive this object.
StorageOperator(const Config &config, Components &components)
: config_(config),
components_(components),
updateWorker_(config_.write_worker()),
storageEventTrace_(config.event_trace_log()) {
for (const auto &ibdev : net::IBDevice::all()) {
concurrentRdmaWriteSemaphore_.emplace(ibdev->id(), config.max_concurrent_rdma_writes());
concurrentRdmaReadSemaphore_.emplace(ibdev->id(), config.max_concurrent_rdma_reads());
}
// The callback captures `this`; the guard is kept in onConfigUpdated_.
onConfigUpdated_ = config_.addCallbackGuard([this]() {
for (auto &[_, semaphore] : concurrentRdmaWriteSemaphore_) {
semaphore.changeUsableTokens(config_.max_concurrent_rdma_writes());
}
for (auto &[_, semaphore] : concurrentRdmaReadSemaphore_) {
semaphore.changeUsableTokens(config_.max_concurrent_rdma_reads());
}
});
}
Result<Void> init(uint32_t numberOfDisks);
Result<Void> stopAndJoin();
// RPC entry points.
CoTryTask<BatchReadRsp> batchRead(ServiceRequestContext &requestCtx,
const BatchReadReq &req,
serde::CallContext &ctx);
CoTryTask<WriteRsp> write(ServiceRequestContext &requestCtx, const WriteReq &req, net::IBSocket *ibSocket);
CoTryTask<UpdateRsp> update(ServiceRequestContext &requestCtx, const UpdateReq &req, net::IBSocket *ibSocket);
CoTryTask<QueryLastChunkRsp> queryLastChunk(ServiceRequestContext &requestCtx, const QueryLastChunkReq &req);
CoTryTask<TruncateChunksRsp> truncateChunks(ServiceRequestContext &requestCtx, const TruncateChunksReq &req);
CoTryTask<RemoveChunksRsp> removeChunks(ServiceRequestContext &requestCtx, const RemoveChunksReq &req);
CoTryTask<TargetSyncInfo> syncStart(const SyncStartReq &req);
CoTryTask<SyncDoneRsp> syncDone(const SyncDoneReq &req);
CoTryTask<SpaceInfoRsp> spaceInfo(const SpaceInfoReq &req);
CoTryTask<CreateTargetRsp> createTarget(const CreateTargetReq &req);
CoTryTask<OfflineTargetRsp> offlineTarget(const OfflineTargetReq &req);
CoTryTask<RemoveTargetRsp> removeTarget(const RemoveTargetReq &req);
CoTryTask<QueryChunkRsp> queryChunk(const QueryChunkReq &req);
CoTryTask<GetAllChunkMetadataRsp> getAllChunkMetadata(const GetAllChunkMetadataReq &req);
protected:
// Callback type taken by processQueryResults(); receives a chunk id and its metadata.
using ChunkMetadataProcessor = std::function<CoTryTask<void>(const ChunkId &, const ChunkMetadata &)>;
// Called by ReliableUpdate::update() (a friend of this class) to execute an update.
CoTask<IOResult> handleUpdate(ServiceRequestContext &requestCtx,
UpdateReq &req,
net::IBSocket *ibSocket,
TargetPtr &target);
CoTask<IOResult> doUpdate(ServiceRequestContext &requestCtx,
const UpdateIO &updateIO,
const UpdateOptions &updateOptions,
uint32_t featureFlags,
const std::shared_ptr<StorageTarget> &target,
net::IBSocket *ibSocket,
BufferPool::Buffer &buffer,
net::RDMARemoteBuf &remoteBuf,
ChunkEngineUpdateJob &chunkEngineJob,
bool allowToAllocate);
CoTask<IOResult> doCommit(ServiceRequestContext &requestCtx,
const CommitIO &commitIO,
const UpdateOptions &updateOptions,
ChunkEngineUpdateJob &chunkEngineJob,
uint32_t featureFlags,
const std::shared_ptr<StorageTarget> &target);
Result<std::vector<std::pair<ChunkId, ChunkMetadata>>> doQuery(ServiceRequestContext &requestCtx,
const VersionedChainId &vChainId,
const ChunkIdRange &chunkIdRange);
// `moreChunksInRange` is an output parameter.
CoTryTask<uint32_t> processQueryResults(ServiceRequestContext &requestCtx,
const VersionedChainId &vChainId,
const ChunkIdRange &chunkIdRanges,
ChunkMetadataProcessor processor,
bool &moreChunksInRange);
CoTask<IOResult> doTruncate(ServiceRequestContext &requestCtx,
const TruncateChunkOp &op,
flat::UserInfo userInfo,
uint32_t featureFlags);
CoTask<IOResult> doRemove(ServiceRequestContext &requestCtx,
const RemoveChunksOp &op,
flat::UserInfo userInfo,
uint32_t featureFlags);
private:
// ReliableUpdate calls the protected handleUpdate().
friend class ReliableUpdate;
ConstructLog<"storage::StorageOperator"> constructLog_;
const Config &config_;
Components &components_;
UpdateWorker updateWorker_;
analytics::StructuredTraceLog<StorageEventTrace> storageEventTrace_;
// Guard for the config-update callback registered in the constructor.
std::unique_ptr<ConfigCallbackGuard> onConfigUpdated_;
// Semaphores keyed by IB device id.
std::map<uint8_t, hf3fs::Semaphore> concurrentRdmaWriteSemaphore_;
std::map<uint8_t, hf3fs::Semaphore> concurrentRdmaReadSemaphore_;
std::atomic<uint64_t> totalReadBytes_{};
std::atomic<uint64_t> totalReadIOs_{};
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,60 @@
#include "storage/service/StorageServer.h"
#include <folly/experimental/coro/BlockingWait.h>
#include <folly/experimental/coro/WithCancellation.h>
#include <folly/logging/xlog.h>
#include "common/kv/mem/MemKVEngine.h"
#include "common/utils/Result.h"
#include "core/service/CoreService.h"
#include "storage/service/ReliableForwarding.h"
#include "storage/service/StorageService.h"
#include "stubs/common/RealStubFactory.h"
#include "stubs/mgmtd/MgmtdServiceStub.h"
namespace hf3fs::storage {
// Builds the network server from the `base()` section of the config and the storage components from the
// whole config.
StorageServer::StorageServer(const Components::Config &config)
: net::Server(config.base()),
components_(config) {}
// Stops the server and waits for it to finish before the members are destroyed, then logs the destruction.
StorageServer::~StorageServer() {
stopAndJoin();
XLOGF(INFO, "Destructor StorageServer");
}
// Server start hook: registers the storage and core services, installs the coroutine pool selector on the
// first service group, and starts the storage components. The first failing step is logged and returned.
Result<Void> StorageServer::beforeStart() {
  RETURN_AND_LOG_ON_ERROR(addSerdeService(std::make_unique<StorageService>(components_.storageOperator), true));
  RETURN_AND_LOG_ON_ERROR(addSerdeService(std::make_unique<core::CoreService>()));

  // Storage RPCs get a pool chosen by method id; packets of any other service use the default pool.
  auto poolSelector = [this](const serde::MessagePacket<> &packet) -> DynamicCoroutinesPool & {
    switch (packet.serviceId) {
      case StorageSerde<>::kServiceID:
        return components_.getCoroutinesPool(packet.methodId);
      default:
        return components_.defaultPool;
    }
  };
  groups().front()->setCoroutinesPoolGetter(std::move(poolSelector));

  RETURN_AND_LOG_ON_ERROR(components_.start(appInfo(), tpg()));
  return Void{};
}
// Server stop hook: marks ReliableUpdate and ReliableForwarding as stopping. Always returns Void{}.
Result<Void> StorageServer::beforeStop() {
components_.reliableUpdate.beforeStop();
components_.reliableForwarding.beforeStop();
return Void{};
}
// Server hook run after stop: stops and joins the storage components, passing them the proc thread pool.
// A failure is logged and returned.
Result<Void> StorageServer::afterStop() {
RETURN_AND_LOG_ON_ERROR(components_.stopAndJoin(tpg().procThreadPool()));
return Void{};
}
// Starts the server with an externally supplied network client and mgmtd client.
// Ownership of `client` moves into the components; `mgmtdClient` is wrapped in a MgmtdClientForServer.
// The base-class start() is then invoked with `info`.
hf3fs::Result<Void> StorageServer::start(const flat::AppInfo &info,
std::unique_ptr<::hf3fs::net::Client> client,
std::shared_ptr<::hf3fs::client::MgmtdClient> mgmtdClient) {
components_.netClient = std::move(client);
components_.mgmtdClient = std::make_unique<hf3fs::client::MgmtdClientForServer>(std::move(mgmtdClient));
return net::Server::start(info);
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,59 @@
#pragma once
#include <folly/CancellationToken.h>
#include "client/mgmtd/MgmtdClientForServer.h"
#include "common/net/Server.h"
#include "core/app/ServerAppConfig.h"
#include "core/app/ServerLauncher.h"
#include "core/app/ServerLauncherConfig.h"
#include "core/app/ServerMgmtdClientFetcher.h"
#include "storage/service/Components.h"
#include "storage/service/ReliableForwarding.h"
#include "storage/service/ReliableUpdate.h"
#include "storage/service/StorageOperator.h"
// Forward declaration of the test helper that StorageServer declares as a friend.
namespace hf3fs::test {
struct StorageServerHelper;
}
namespace hf3fs::storage {
// Network server hosting the storage service; it owns the storage Components.
class StorageServer : public net::Server {
public:
static constexpr auto kName = "Storage";
static constexpr auto kNodeType = flat::NodeType::STORAGE;
// Type aliases; presumably consumed by the generic server launcher - confirm against ServerLauncher.h.
using AppConfig = core::ServerAppConfig;
struct LauncherConfig : public core::ServerLauncherConfig {
// Resets the mgmtd client section to a default-constructed MgmtdClientForServer config.
LauncherConfig() { mgmtd_client() = hf3fs::client::MgmtdClientForServer::Config{}; }
};
using RemoteConfigFetcher = core::launcher::ServerMgmtdClientFetcher;
using Launcher = core::ServerLauncher<StorageServer>;
using CommonConfig = ApplicationBase::Config;
using Config = Components::Config;
StorageServer(const Components::Config &config);
~StorageServer() override;
// set up storage server.
Result<Void> beforeStart() final;
// before server stop.
Result<Void> beforeStop() final;
// tear down storage server.
Result<Void> afterStop() final;
// Keep the base-class start() overloads visible next to the overload declared below.
using net::Server::start;
// Starts the server with externally supplied network and mgmtd clients.
hf3fs::Result<Void> start(const flat::AppInfo &info,
std::unique_ptr<::hf3fs::net::Client> client,
std::shared_ptr<::hf3fs::client::MgmtdClient> mgmtdClient);
private:
friend struct test::StorageServerHelper;
ConstructLog<"storage::StorageServer"> constructLog_;
Components components_;
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,32 @@
#include "storage/service/StorageService.h"
#include "common/monitor/Recorder.h"
namespace hf3fs::storage {
// File-local latency recorders, one per request class; fed by the StorageService::report*QueueLatency helpers.
namespace {
monitor::LatencyRecorder readQueueLatency{"storage.read.queue_latency"};
monitor::LatencyRecorder updateQueueLatency{"storage.update.queue_latency"};
monitor::LatencyRecorder defaultQueueLatency{"storage.default.queue_latency"};
} // namespace
// Records the queue latency of a read request; packets that carry no timestamp are skipped.
void StorageService::reportReadQueueLatency(serde::CallContext &ctx) {
  auto &&timestamp = ctx.packet().timestamp;
  if (!timestamp) {
    return;
  }
  readQueueLatency.addSample(timestamp->queueLatency());
}
// Records the queue latency of a write/update request; packets that carry no timestamp are skipped.
void StorageService::reportUpdateQueueLatency(serde::CallContext &ctx) {
  auto &&timestamp = ctx.packet().timestamp;
  if (!timestamp) {
    return;
  }
  updateQueueLatency.addSample(timestamp->queueLatency());
}
// Records the queue latency of any other request; packets that carry no timestamp are skipped.
void StorageService::reportDefaultQueueLatency(serde::CallContext &ctx) {
  auto &&timestamp = ctx.packet().timestamp;
  if (!timestamp) {
    return;
  }
  defaultQueueLatency.addSample(timestamp->queueLatency());
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,111 @@
#pragma once
#include "common/serde/CallContext.h"
#include "fbs/storage/Service.h"
#include "storage/service/StorageOperator.h"
namespace hf3fs::storage {
// RPC front end of the storage node. Every handler first records the queue latency of the incoming packet
// and then forwards the request to the StorageOperator method of the same name.
class StorageService : public serde::ServiceWrapper<StorageService, storage::StorageSerde> {
public:
// Keeps a reference only; `storageOperator` must outlive the service.
StorageService(StorageOperator &storageOperator)
: storageOperator_(storageOperator) {}
// An empty batch is answered immediately with a response that only echoes the request tag.
CoTryTask<BatchReadRsp> batchRead(serde::CallContext &ctx, const BatchReadReq &req) {
reportReadQueueLatency(ctx);
if (UNLIKELY(req.payloads.empty())) co_return BatchReadRsp{.tag = req.tag};
ServiceRequestContext requestCtx{"batchRead", req.tag, req.retryCount, req.userInfo, req.debugFlags};
co_return co_await storageOperator_.batchRead(requestCtx, req, ctx);
}
// write and update additionally pass the IB socket of the transport the request arrived on.
CoTryTask<WriteRsp> write(serde::CallContext &ctx, const WriteReq &req) {
reportUpdateQueueLatency(ctx);
ServiceRequestContext requestCtx{"write", req.tag, req.retryCount, req.userInfo, req.debugFlags};
co_return co_await storageOperator_.write(requestCtx, req, ctx.transport()->ibSocket());
}
CoTryTask<UpdateRsp> update(serde::CallContext &ctx, const UpdateReq &req) {
reportUpdateQueueLatency(ctx);
ServiceRequestContext requestCtx{"update", req.tag, req.retryCount, req.userInfo, req.debugFlags};
co_return co_await storageOperator_.update(requestCtx, req, ctx.transport()->ibSocket());
}
// Requests without payloads are answered with a default-constructed response.
CoTryTask<QueryLastChunkRsp> queryLastChunk(serde::CallContext &ctx, const QueryLastChunkReq &req) {
reportDefaultQueueLatency(ctx);
if (UNLIKELY(req.payloads.empty())) co_return QueryLastChunkRsp{};
ServiceRequestContext requestCtx{"queryLastChunk", req.tag, req.retryCount, req.userInfo, req.debugFlags};
co_return co_await storageOperator_.queryLastChunk(requestCtx, req);
}
// The request context takes tag and retry count from the first payload.
CoTryTask<TruncateChunksRsp> truncateChunks(serde::CallContext &ctx, const TruncateChunksReq &req) {
reportDefaultQueueLatency(ctx);
if (UNLIKELY(req.payloads.empty())) co_return TruncateChunksRsp{};
ServiceRequestContext requestCtx{"truncateChunks",
req.payloads.front().tag,
req.payloads.front().retryCount,
req.userInfo,
req.debugFlags};
co_return co_await storageOperator_.truncateChunks(requestCtx, req);
}
// The request context takes tag and retry count from the first payload.
CoTryTask<RemoveChunksRsp> removeChunks(serde::CallContext &ctx, const RemoveChunksReq &req) {
reportDefaultQueueLatency(ctx);
if (UNLIKELY(req.payloads.empty())) co_return RemoveChunksRsp{};
ServiceRequestContext requestCtx{"removeChunks",
req.payloads.front().tag,
req.payloads.front().retryCount,
req.userInfo,
req.debugFlags};
co_return co_await storageOperator_.removeChunks(requestCtx, req);
}
// The remaining handlers are not coroutines themselves: they return the operator's task directly.
CoTryTask<TargetSyncInfo> syncStart(serde::CallContext &ctx, const SyncStartReq &req) {
reportDefaultQueueLatency(ctx);
return storageOperator_.syncStart(req);
}
CoTryTask<SyncDoneRsp> syncDone(serde::CallContext &ctx, const SyncDoneReq &req) {
reportDefaultQueueLatency(ctx);
return storageOperator_.syncDone(req);
}
CoTryTask<SpaceInfoRsp> spaceInfo(serde::CallContext &ctx, const SpaceInfoReq &req) {
reportDefaultQueueLatency(ctx);
return storageOperator_.spaceInfo(req);
}
CoTryTask<CreateTargetRsp> createTarget(serde::CallContext &ctx, const CreateTargetReq &req) {
reportDefaultQueueLatency(ctx);
return storageOperator_.createTarget(req);
}
CoTryTask<OfflineTargetRsp> offlineTarget(serde::CallContext &ctx, const OfflineTargetReq &req) {
reportDefaultQueueLatency(ctx);
return storageOperator_.offlineTarget(req);
}
CoTryTask<RemoveTargetRsp> removeTarget(serde::CallContext &ctx, const RemoveTargetReq &req) {
reportDefaultQueueLatency(ctx);
return storageOperator_.removeTarget(req);
}
CoTryTask<QueryChunkRsp> queryChunk(serde::CallContext &ctx, const QueryChunkReq &req) {
reportDefaultQueueLatency(ctx);
return storageOperator_.queryChunk(req);
}
CoTryTask<GetAllChunkMetadataRsp> getAllChunkMetadata(serde::CallContext &ctx, const GetAllChunkMetadataReq &req) {
reportDefaultQueueLatency(ctx);
return storageOperator_.getAllChunkMetadata(req);
}
private:
// Each helper adds the packet's queue latency to the matching recorder when the packet has a timestamp.
void reportReadQueueLatency(serde::CallContext &ctx);
void reportUpdateQueueLatency(serde::CallContext &ctx);
void reportDefaultQueueLatency(serde::CallContext &ctx);
private:
StorageOperator &storageOperator_;
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,418 @@
#include "storage/service/TargetMap.h"
#include <algorithm>
#include "common/monitor/Recorder.h"
#include "common/utils/RobinHood.h"
#include "fbs/mgmtd/MgmtdTypes.h"
namespace hf3fs::storage {
// File-local recorder wrapping TargetMap::updateRouting().
namespace {
monitor::OperationRecorder updateRoutingRecorder{"storage.update_routing"};
} // namespace
// Resolves the network address used to reach this target's successor: the first endpoint of the first
// service group of the successor's node.
// Errors: kNoSuccessorTarget when there is no successor, kNoSuccessorAddr when the successor's node has no
// service group or the first group has no endpoint.
Result<net::Address> Target::getSuccessorAddr() const {
  if (UNLIKELY(!successor.has_value())) {
    return makeError(StorageCode::kNoSuccessorTarget);
  }

  const auto &groups = successor->nodeInfo.app.serviceGroups;
  if (UNLIKELY(groups.empty())) {
    auto reason = fmt::format("target {} successor service groups is empty", *this);
    XLOG(ERR, reason);
    return makeError(StorageCode::kNoSuccessorAddr, std::move(reason));
  }

  const auto &addresses = groups.front().endpoints;
  if (UNLIKELY(addresses.empty())) {
    auto reason = fmt::format("target {} successor service endpoints is empty", *this);
    XLOG(ERR, reason);
    return makeError(StorageCode::kNoSuccessorAddr, std::move(reason));
  }

  return addresses.front();
}
// Returns the id of the local target that belongs to `chainId`, or kRoutingError if the chain is unknown.
Result<TargetId> TargetMap::getTargetId(ChainId chainId) const {
  if (const auto entry = chainToTarget_.find(chainId); UNLIKELY(entry == chainToTarget_.end())) {
    auto reason = fmt::format("chain {} not found", chainId);
    XLOG(ERR, reason);
    return makeError(StorageClientCode::kRoutingError, std::move(reason));
  } else {
    return entry->second;
  }
}
// Returns a read-only pointer to the target registered under `targetId`, or kRoutingError (logged) if there
// is none.
Result<const Target *> TargetMap::getTarget(TargetId targetId) const {
  if (const auto entry = targets_.find(targetId); UNLIKELY(entry == targets_.end())) {
    auto reason = fmt::format("target {} not found", targetId);
    XLOG(ERR, reason);
    return makeError(StorageClientCode::kRoutingError, std::move(reason));
  } else {
    return &entry->second;
  }
}
// Looks up the local target serving the chain in `vChainId` and checks that it can serve the request.
//
// A versioned chain id that differs from the local one is rejected with kRoutingVersionMismatch, unless
// `allowOutdatedChainVer` is set and the request's chain version is not newer than the local one.
// A target whose local state is OFFLINE, or which has no storage target attached, yields kTargetOffline.
// Lookup failures of the chain or the target are propagated.
Result<const Target *> TargetMap::getByChainId(VersionedChainId vChainId, bool allowOutdatedChainVer) const {
  CHECK_RESULT(targetId, getTargetId(vChainId.chainId));
  CHECK_RESULT(target, getTarget(targetId));

  if (target->vChainId != vChainId) {
    const bool requestIsNewer = vChainId.chainVer > target->vChainId.chainVer;
    if (!allowOutdatedChainVer || requestIsNewer) {
      auto reason = fmt::format("chain {} version mismatch request {} != local {}",
                                vChainId.chainId,
                                vChainId.chainVer,
                                target->vChainId.chainVer);
      XLOG(ERR, reason);
      return makeError(StorageClientCode::kRoutingVersionMismatch, std::move(reason));
    }
  }

  if (target->localState == flat::LocalTargetState::OFFLINE) {
    auto reason = fmt::format("chain {} target {} is offline", vChainId.chainId, target->targetId);
    XLOG(ERR, reason);
    return makeError(StorageCode::kTargetOffline, std::move(reason));
  }

  // Same error code as above, but logged at CRITICAL: the state is not OFFLINE yet no storage is attached.
  if (target->storageTarget == nullptr) {
    auto reason = fmt::format("chain {} target {} is offline", vChainId.chainId, target->targetId);
    XLOG(CRITICAL, reason);
    return makeError(StorageCode::kTargetOffline, std::move(reason));
  }

  return target;
}
// Registers `storageTarget` in the map as an ONLINE target.
// If an entry with the same target id already exists it is replaced only when its local state is OFFLINE;
// otherwise kTargetStateInvalid is returned.
Result<Void> TargetMap::addStorageTarget(const std::shared_ptr<StorageTarget> &storageTarget) {
  auto targetId = storageTarget->targetId();

  // Describe the new entry from the storage target's own properties.
  Target fresh;
  fresh.storageTarget = storageTarget;
  fresh.targetId = targetId;
  fresh.chainId = storageTarget->chainId();
  fresh.path = storageTarget->path();
  fresh.localState = flat::LocalTargetState::ONLINE;
  fresh.diskIndex = storageTarget->diskIndex();
  fresh.useChunkEngine = storageTarget->useChunkEngine();

  auto [slot, inserted] = targets_.emplace(targetId, fresh);
  if (UNLIKELY(!inserted)) {
    if (slot->second.localState != flat::LocalTargetState::OFFLINE) {
      auto reason = fmt::format("target {} already exists", targetId);
      XLOG(ERR, reason);
      return makeError(StorageCode::kTargetStateInvalid, std::move(reason));
    }
    // The existing entry is offline: overwrite it with the new one.
    slot->second = std::move(fresh);
  }
  return Void{};
}
// Returns a writable pointer to the target registered under `targetId`, or kRoutingError if there is none.
// Unlike getTarget(), a miss is not logged.
Result<Target *> TargetMap::getMutableTarget(TargetId targetId) {
  if (const auto entry = targets_.find(targetId); UNLIKELY(entry == targets_.end())) {
    auto reason = fmt::format("target {} not found", targetId);
    return makeError(StorageClientCode::kRoutingError, std::move(reason));
  } else {
    return &entry->second;
  }
}
// Marks the local target of `chainId` as UPTODATE once it has received all sync data.
// The chain version must match exactly (outdated versions are not accepted); lookup errors are propagated.
Result<Void> TargetMap::syncReceiveDone(VersionedChainId chainId) {
  CHECK_RESULT(readonlyTarget, getByChainId(chainId, false));
  auto syncedTargetId = readonlyTarget->targetId;
  CHECK_RESULT(writableTarget, getMutableTarget(syncedTargetId));

  // Log the previous state before it is overwritten.
  XLOGF(WARNING,
        "chain {} target {} sync receive done {} -> UPTODATE",
        chainId,
        syncedTargetId,
        magic_enum::enum_name(writableTarget->localState));
  writableTarget->localState = flat::LocalTargetState::UPTODATE;
  return Void{};
}
// Rebuild all routing-derived state of this map from a routing info snapshot.
//
// The previous chain mapping, syncing-chain list and per-target chain fields are reset first,
// then every chain in the routing info that contains a locally known target is re-applied.
// Returns kRoutingError if `r` is null, older than the version already applied, or inconsistent
// with the local targets. Note that the reset happens before validation of individual chains, so
// on an error return this map is left partially updated; AtomicallyTargetMap::updateTargetMap
// runs this on a clone and discards it on failure.
//
// `log` only controls the head/tail transition warnings.
Result<Void> TargetMap::updateRouting(std::shared_ptr<hf3fs::client::RoutingInfo> r, bool log /* = true */) {
auto recordGuard = updateRoutingRecorder.record();
if (UNLIKELY(r == nullptr)) {
XLOGF(ERR, "routing info is empty");
return makeError(StorageClientCode::kRoutingError, "routing info is empty");
}
auto &routingInfo = r->raw();
// Reject routing info older than what has been applied; an equal version is re-applied.
if (routingInfoVersion_ > routingInfo->routingInfoVersion) {
auto msg = fmt::format("routing info expired! {} > {}", routingInfoVersion_, routingInfo->routingInfoVersion);
XLOG(ERR, msg);
return makeError(StorageClientCode::kRoutingError, std::move(msg));
}
XLOGF(INFO, "routing info updated, {} -> {}", routingInfoVersion_, routingInfo->routingInfoVersion);
// 1. reset current state.
routingInfoVersion_ = routingInfo->routingInfoVersion;
chainToTarget_.clear();
syncingChains_.clear();
// Remember the previous head/tail/LASTSRV roles so transitions can be detected after the rebuild.
robin_hood::unordered_set<TargetId> headTargets;
robin_hood::unordered_set<TargetId> tailTargets;
robin_hood::unordered_set<TargetId> lastSrvTargets;
for (auto &[targetId, target] : targets_) {
if (target.isHead) {
headTargets.insert(target.targetId);
}
if (target.isTail) {
tailTargets.insert(target.targetId);
}
if (target.publicState == flat::PublicTargetState::LASTSRV) {
lastSrvTargets.insert(target.targetId);
}
target.isHead = false;
target.isTail = false;
target.vChainId = VersionedChainId{};
target.publicState = flat::PublicTargetState::INVALID;
target.successor = std::nullopt;
}
// Dump the whole routing info once on any exit path if an invalid successor was seen.
bool invalidRoutingInfo = false;
auto invalidRoutingInfoLogGuard = folly::makeGuard([&] {
if (invalidRoutingInfo) {
XLOGF(CRITICAL, "invalid routing info: {}", *routingInfo);
}
});
// 2. iterate routing info.
for (auto &[id, chain] : routingInfo->chains) {
// 3. find target in chain.
// Only the first chain member that is present in this map is considered; chains without one are skipped.
auto it = std::find_if(chain.targets.begin(), chain.targets.end(), [&](const flat::ChainTargetInfo &targetInfo) {
return bool(getMutableTarget(targetInfo.targetId));
});
if (it == chain.targets.end()) {
continue;
}
// 4. find target info.
auto targetId = it->targetId;
auto targetInfo = routingInfo->getTarget(targetId);
if (UNLIKELY(!targetInfo)) {
auto msg = fmt::format("targetInfo id {} not found", targetId);
XLOG(ERR, msg);
return makeError(StorageClientCode::kRoutingError, std::move(msg));
}
// 5. update local target.
CHECK_RESULT(target, getMutableTarget(targetId));
bool targetIsServing = targetInfo->publicState == flat::PublicTargetState::SERVING ||
targetInfo->publicState == flat::PublicTargetState::SYNCING;
auto previousLocalState = target->localState;
// Head = a serving/syncing target that is the first member of its chain.
target->isHead = (targetIsServing && it == chain.targets.begin());
target->vChainId = VersionedChainId{chain.chainId, chain.chainVersion};
if (target->storageTarget != nullptr) {
// A storage target with a default chain id is bound to this chain; otherwise the ids must agree.
if (target->storageTarget->chainId() == ChainId{}) {
RETURN_AND_LOG_ON_ERROR(target->storageTarget->setChainId(chain.chainId));
}
if (target->storageTarget->chainId() != chain.chainId) {
auto msg = fmt::format("target.chain != routing.chain, target {}, chain {}", *target, chain);
XLOG(ERR, msg);
return makeError(StorageClientCode::kRoutingError, std::move(msg));
}
}
target->localState = updateLocalState(targetId, previousLocalState, targetInfo->publicState);
target->publicState = targetInfo->publicState;
auto [chainToTargetIt, succ] = chainToTarget_.emplace(chain.chainId, targetId);
if (!succ) {
auto msg = fmt::format("chain {} map to 2 targets {}, {}", chain.chainId, chainToTargetIt->second, targetId);
XLOG(ERR, msg);
return makeError(StorageClientCode::kRoutingError, std::move(msg));
}
// On a transition into OFFLINE, drop the strong reference to the storage target and keep only a weak one.
// NOTE(review): dereferences storageTarget without a null check; presumably it is always set
// while the local state is not OFFLINE - confirm.
if (previousLocalState != flat::LocalTargetState::OFFLINE &&
target->localState == flat::LocalTargetState::OFFLINE) {
target->weakStorageTarget = target->storageTarget->aliveWeakPtr();
target->storageTarget = nullptr;
continue;
}
// 6. update successor.
// The loop body always ends in `break`, so only the chain member immediately after this target is examined.
while (targetIsServing && ++it != chain.targets.end()) {
auto targetInfo = routingInfo->getTarget(it->targetId);
if (UNLIKELY(!targetInfo)) {
auto msg = fmt::format("successor {} not found", it->targetId);
XLOG(ERR, msg);
return makeError(StorageClientCode::kRoutingError, std::move(msg));
}
// Only a SERVING or SYNCING neighbour becomes the successor; a SYNCING one also registers the chain as syncing.
if (targetInfo->publicState == flat::PublicTargetState::SERVING) {
target->successor = Successor{{}, *targetInfo};
} else if (targetInfo->publicState == flat::PublicTargetState::SYNCING) {
target->successor = Successor{{}, *targetInfo};
syncingChains_.push_back(VersionedChainId{chain.chainId, chain.chainVersion});
}
if (target->successor) {
// Resolve the successor's node; if it cannot be resolved the successor keeps a default node info.
if (!targetInfo->nodeId.has_value()) {
XLOGF(WARNING, "target {} node id is nullopt", it->targetId);
break;
}
auto node = routingInfo->getNode(*targetInfo->nodeId);
if (!node) {
XLOGF(WARNING, "node {} not found", targetInfo->nodeId);
break;
}
target->successor->nodeInfo = *node;
if (UNLIKELY(target->successor->nodeInfo.app.serviceGroups.empty())) {
XLOGF(CRITICAL, "successor invalid! chain {}, successor {}, node {}", chain.chainId, *targetInfo, *node);
invalidRoutingInfo = true;
}
}
break;
}
// Tail = a serving/syncing target without a successor.
target->isTail = (targetIsServing && !target->successor.has_value());
// Log head/tail role changes relative to the state captured before the reset.
if (headTargets.contains(targetId) ^ target->isHead) {
if (target->isHead) {
XLOGF_IF(WARNING, log, "target {} becomes head", targetId);
} else {
XLOGF_IF(WARNING, log, "target {} is no longer head", targetId);
}
}
if (tailTargets.contains(targetId) ^ target->isTail) {
if (target->isTail) {
XLOGF_IF(WARNING, log, "target {} becomes tail", targetId);
} else {
XLOGF_IF(WARNING, log, "target {} is no longer tail", targetId);
}
}
}
// Targets that were LASTSRV before this update and are now SERVING/SYNCING/WAITING
// get their uncommitted chunks reset against the new chain version.
for (auto &[targetId, target] : targets_) {
if (lastSrvTargets.contains(targetId) && target.storageTarget &&
(target.publicState == flat::PublicTargetState::SERVING ||
target.publicState == flat::PublicTargetState::SYNCING ||
target.publicState == flat::PublicTargetState::WAITING)) {
target.storageTarget->resetUncommitted(target.vChainId.chainVer);
}
}
recordGuard.succ();
return Void{};
}
// Erase the target with the given id from this map; kRoutingError if it is not present.
Result<Void> TargetMap::removeTarget(TargetId targetId) {
  const auto erasedCount = targets_.erase(targetId);
  if (erasedCount == 1) {
    return Void{};
  }
  return makeError(StorageClientCode::kRoutingError, fmt::format("target {} not found", targetId));
}
// Take a single target offline on user request.
// Fails with kTargetOffline when the target already reports unrecoverableOffline().
Result<Void> TargetMap::offlineTarget(TargetId targetId) {
  CHECK_RESULT(entry, getMutableTarget(targetId));
  if (entry->unrecoverableOffline()) {
    auto msg = fmt::format("target is already offline, {}.", *entry);
    return makeError(StorageCode::kTargetOffline, std::move(msg));
  }
  entry->localState = flat::LocalTargetState::OFFLINE;
  entry->offlineUponUserRequest = true;
  return Void{};
}
// Flag every target whose parent directory equals `path` as having a disk error and
// move it to OFFLINE. Targets that are already unrecoverably offline are left untouched.
Result<Void> TargetMap::offlineTargets(const Path &path) {
  for (auto &entry : targets_) {
    auto &target = entry.second;
    if (path != target.path.parent_path()) {
      continue;
    }
    if (target.unrecoverableOffline()) {
      continue;
    }
    target.diskError = true;
    target.localState = flat::LocalTargetState::OFFLINE;
    XLOGF(WARNING, "offline target {} because of disk error", target.path);
  }
  return Void{};
}
// Propagate disk-level flags (low space / reject chunk creation) to every target that lives
// directly under `path`, skipping unrecoverably offline targets. A change of the reject flag
// is logged.
Result<Void> TargetMap::updateDiskState(const Path &path, bool lowSpace, bool rejectCreateChunk) {
  for (auto &entry : targets_) {
    auto &target = entry.second;
    const bool affected = path == target.path.parent_path() && !target.unrecoverableOffline();
    if (!affected) {
      continue;
    }
    target.lowSpace = lowSpace;
    const auto previousReject = target.rejectCreateChunk;
    target.rejectCreateChunk = rejectCreateChunk;
    if (previousReject != rejectCreateChunk) {
      XLOGF(WARNING, "target {} reject create chunk {} -> {}", target.path, previousReject, rejectCreateChunk);
    }
  }
  return Void{};
}
// Compute the next local state of a target from its current local state and the public
// state announced by routing info:
//   UPTODATE + (OFFLINE | LASTSRV | WAITING) -> OFFLINE
//   ONLINE   + SERVING                       -> UPTODATE
// Every other combination leaves the local state unchanged.
hf3fs::flat::LocalTargetState TargetMap::updateLocalState(TargetId targetId,
                                                          hf3fs::flat::LocalTargetState localState,
                                                          hf3fs::flat::PublicTargetState publicState) {
  using Local = hf3fs::flat::LocalTargetState;
  using Public = hf3fs::flat::PublicTargetState;

  if (localState == Local::UPTODATE) {
    const bool mustShutdown =
        publicState == Public::OFFLINE || publicState == Public::LASTSRV || publicState == Public::WAITING;
    if (!mustShutdown) {
      return localState;
    }
    XLOGF(CRITICAL,
          "move to offline state (shutdown), local target: {}, local state: {} -> OFFLINE, public state: {}",
          targetId,
          magic_enum::enum_name(localState),
          magic_enum::enum_name(publicState));
    return Local::OFFLINE;
  }

  if (localState == Local::ONLINE && publicState == Public::SERVING) {
    XLOGF(INFO,
          "move to up-to-date state, local target: {}, local state: {} -> UPTODATE, public state: {}",
          targetId,
          magic_enum::enum_name(localState),
          magic_enum::enum_name(publicState));
    return Local::UPTODATE;
  }

  return localState;
}
// Resolve a target by versioned chain id against the current snapshot.
// The returned pointer shares ownership with the snapshot, so the target stays valid
// for as long as the caller holds it, even if the map is replaced meanwhile.
Result<std::shared_ptr<const Target>> AtomicallyTargetMap::getByChainId(
    VersionedChainId vChainId,
    bool allowOutdatedChainVer /* = false */) const {
  auto snap = snapshot();
  auto lookup = snap->getByChainId(vChainId, allowOutdatedChainVer);
  RETURN_ON_ERROR(lookup);
  const Target *raw = *lookup;
  return std::shared_ptr<const Target>(std::move(snap), raw);
}
// Resolve a target by id against the current snapshot; the result keeps the snapshot alive.
Result<std::shared_ptr<const Target>> AtomicallyTargetMap::getByTargetId(TargetId targetId) const {
  auto snap = snapshot();
  auto lookup = snap->getTarget(targetId);
  RETURN_ON_ERROR(lookup);
  const Target *raw = *lookup;
  return std::shared_ptr<const Target>(std::move(snap), raw);
}
// Copy-on-write update of the published target map.
// Under `mutex_`, clones the current snapshot, applies `updateFunc` to the clone and publishes
// it with a compare-and-swap; if the swap loses to a concurrent change of `targetMap_` the
// clone/apply step is repeated on the newer snapshot. If `updateFunc` fails, nothing is
// published and its error is returned. The update callback is invoked with the new snapshot
// while the mutex is still held.
Result<Void> AtomicallyTargetMap::updateTargetMap(auto &&updateFunc) {
  std::unique_lock guard(mutex_);
  auto current = snapshot();
  for (;;) {
    auto draft = current->clone();
    RETURN_AND_LOG_ON_ERROR(updateFunc(draft));
    const bool published = targetMap_.compare_exchange_strong(current, std::move(draft));
    if (published) {
      break;
    }
  }
  updateCallback_(*snapshot());
  return Void{};
}
// Atomically publish a copy of the map with `storageTarget` added.
Result<Void> AtomicallyTargetMap::addStorageTarget(std::shared_ptr<StorageTarget> storageTarget) {
  auto apply = [&storageTarget](std::shared_ptr<TargetMap> &draft) { return draft->addStorageTarget(storageTarget); };
  return updateTargetMap(apply);
}
// Atomically publish a copy of the map in which the target of `vChainId` is marked as synced.
Result<Void> AtomicallyTargetMap::syncReceiveDone(VersionedChainId vChainId) {
  auto apply = [&vChainId](std::shared_ptr<TargetMap> &draft) { return draft->syncReceiveDone(vChainId); };
  return updateTargetMap(apply);
}
// Atomically publish a copy of the map rebuilt from routing info `r`.
Result<Void> AtomicallyTargetMap::updateRouting(std::shared_ptr<hf3fs::client::RoutingInfo> r) {
  auto apply = [&r](std::shared_ptr<TargetMap> &draft) { return draft->updateRouting(r); };
  return updateTargetMap(apply);
}
// Atomically publish a copy of the map without target `targetId`.
Result<Void> AtomicallyTargetMap::removeTarget(TargetId targetId) {
  auto apply = [&targetId](std::shared_ptr<TargetMap> &draft) { return draft->removeTarget(targetId); };
  return updateTargetMap(apply);
}
// Atomically publish a copy of the map in which target `targetId` is offline.
Result<Void> AtomicallyTargetMap::offlineTarget(TargetId targetId) {
  auto apply = [&targetId](std::shared_ptr<TargetMap> &draft) { return draft->offlineTarget(targetId); };
  return updateTargetMap(apply);
}
// Atomically publish a copy of the map in which all targets under `path` are offline.
Result<Void> AtomicallyTargetMap::offlineTargets(const Path &path) {
  auto apply = [&path](std::shared_ptr<TargetMap> &draft) { return draft->offlineTargets(path); };
  return updateTargetMap(apply);
}
// Atomically publish a copy of the map with updated disk flags for all targets under `path`.
Result<Void> AtomicallyTargetMap::updateDiskState(const Path &path, bool lowSpace, bool rejectCreateChunk) {
  auto apply = [&path, &lowSpace, &rejectCreateChunk](std::shared_ptr<TargetMap> &draft) {
    return draft->updateDiskState(path, lowSpace, rejectCreateChunk);
  };
  return updateTargetMap(apply);
}
// Re-run the update callback on the current snapshot without modifying the map.
// Takes `mutex_` so the callback is serialized with map updates.
void AtomicallyTargetMap::updateTargetUsedSize() {
  std::unique_lock guard(mutex_);
  const auto current = snapshot();
  updateCallback_(*current);
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,137 @@
#pragma once
#include <atomic>
#include <common/utils/RobinHood.h>
#include <folly/concurrency/AtomicSharedPtr.h>
#include <memory>
#include "client/mgmtd/RoutingInfo.h"
#include "common/serde/Serde.h"
#include "common/utils/ConstructLog.h"
#include "fbs/mgmtd/MgmtdTypes.h"
#include "fbs/mgmtd/NodeInfo.h"
#include "fbs/mgmtd/TargetInfo.h"
#include "fbs/storage/Common.h"
#include "storage/store/StorageTarget.h"
// Forward declaration of the test-only helper that is befriended by TargetMap and
// AtomicallyTargetMap below.
namespace hf3fs::test {
struct TargetMapHelper;
}
namespace hf3fs::storage {
// Plain (non-thread-safe) map of the storage targets known to this node, together with the
// state derived from the latest routing info. AtomicallyTargetMap publishes immutable
// snapshots of this class and applies modifiers to clones.
class TargetMap {
public:
// [observers] clone current map.
std::shared_ptr<TargetMap> clone() const { return std::make_shared<TargetMap>(*this); }
// [observers] get target id by chain id.
Result<TargetId> getTargetId(ChainId chainId) const;
// [observers] get target by target id.
Result<const Target *> getTarget(TargetId targetId) const;
// [observers] get target by versioned chain id.
Result<const Target *> getByChainId(VersionedChainId vChainId, bool allowOutdatedChainVer) const;
// [observers] all targets, keyed by target id.
auto &getTargets() const { return targets_; }
// [observers] chains recorded as syncing by the last updateRouting call.
auto &syncingChains() const { return syncingChains_; }
// [modifiers] add a new target.
Result<Void> addStorageTarget(const std::shared_ptr<StorageTarget> &storageTarget);
// [modifiers] get target by target id.
Result<Target *> getMutableTarget(TargetId targetId);
// [modifiers] sync receive is done: mark the chain's local target as UPTODATE.
Result<Void> syncReceiveDone(VersionedChainId vChainId);
// [modifiers] update by routing info. `log` controls head/tail transition warnings.
Result<Void> updateRouting(std::shared_ptr<hf3fs::client::RoutingInfo> r, bool log = true);
// [modifiers] remove target from the map.
Result<Void> removeTarget(TargetId targetId);
// [modifiers] set target as offline.
Result<Void> offlineTarget(TargetId targetId);
// [modifiers] set targets in path as offline.
Result<Void> offlineTargets(const Path &path);
// [modifiers] update low-space / reject-create-chunk flags for targets in path.
Result<Void> updateDiskState(const Path &path, bool lowSpace, bool rejectCreateChunk);
// compute the next local state from the current local state and the public state.
static hf3fs::flat::LocalTargetState updateLocalState(TargetId targetId,
hf3fs::flat::LocalTargetState localState,
hf3fs::flat::PublicTargetState publicState);
private:
friend struct test::TargetMapHelper;
// all known targets, keyed by target id.
robin_hood::unordered_map<TargetId, Target> targets_;
// version of the routing info applied last.
// NOTE(review): no in-class initializer; presumably the type value-initializes itself - confirm.
flat::RoutingInfoVersion routingInfoVersion_;
// chain id -> local target id, rebuilt on every updateRouting.
robin_hood::unordered_map<ChainId, TargetId> chainToTarget_;
// chains whose successor is in SYNCING state, rebuilt on every updateRouting.
std::vector<VersionedChainId> syncingChains_;
};
// Thread-safe holder of a TargetMap. Readers obtain an immutable snapshot via an atomic
// shared pointer; every modifier applies its change to a clone and atomically publishes it
// (see updateTargetMap), then invokes the update callback.
class AtomicallyTargetMap {
public:
// [observers] get a snapshot of target map.
auto snapshot() const { return targetMap_.load(); }
// [observers] get target by chain id.
Result<std::shared_ptr<const Target>> getByChainId(VersionedChainId vChainId,
bool allowOutdatedChainVer = false) const;
// [observers] get target by its id.
Result<std::shared_ptr<const Target>> getByTargetId(TargetId targetId) const;
// [modifiers] set update callback.
void setUpdateCallback(auto &&func) {
auto lock = std::unique_lock(mutex_);
updateCallback_ = std::forward<decltype(func)>(func);
}
// [modifiers] add a target.
Result<Void> addStorageTarget(std::shared_ptr<StorageTarget> storageTarget);
// [modifiers] sync receive is done.
Result<Void> syncReceiveDone(VersionedChainId vChainId);
// [modifiers] update by routing info.
Result<Void> updateRouting(std::shared_ptr<hf3fs::client::RoutingInfo> r);
// [modifiers] remove target from the map.
Result<Void> removeTarget(TargetId targetId);
// [modifiers] set target as offline.
Result<Void> offlineTarget(TargetId targetId);
// [modifiers] set targets in path as offline.
Result<Void> offlineTargets(const Path &path);
// [modifiers] update low-space / reject-create-chunk flags for targets in path.
Result<Void> updateDiskState(const Path &path, bool lowSpace, bool rejectCreateChunk);
// [modifiers] invoke the update callback on the current snapshot (map itself is unchanged).
void updateTargetUsedSize();
// [modifiers] release target map. Leaves a null pointer behind and returns the old map.
// NOTE(review): snapshot() returns null afterwards and the other members dereference it
// unchecked - presumably only called during shutdown; confirm.
auto release() { return targetMap_.exchange(nullptr); }
protected:
// [modifiers] update target map atomically.
Result<Void> updateTargetMap(auto &&updateFunc);
private:
friend struct test::TargetMapHelper;
ConstructLog<"storage::AtomicallyTargetMap"> constructLog_;
std::mutex mutex_; // for update operation.
// invoked with the current snapshot after every successful update; defaults to a no-op.
std::function<void(const TargetMap &)> updateCallback_ = [](auto) {};
// currently published map; starts as an empty TargetMap.
folly::atomic_shared_ptr<const TargetMap> targetMap_{std::make_shared<const TargetMap>()};
};
} // namespace hf3fs::storage

8
src/storage/storage.cpp Normal file
View File

@@ -0,0 +1,8 @@
#include "common/app/TwoPhaseApplication.h"
#include "memory/common/OverrideCppNewDelete.h"
#include "storage/service/StorageServer.h"
int main(int argc, char *argv[]) {
using namespace hf3fs;
return TwoPhaseApplication<storage::StorageServer>().run(argc, argv);
}

View File

@@ -0,0 +1,112 @@
#include "ChunkEngine.h"
#include "chunk_engine/src/cxx.rs.h"
#include "fbs/storage/Common.h"
#include "storage/update/UpdateJob.h"
namespace hf3fs::storage {
// File-local operation recorders used by ChunkEngine::update and ChunkEngine::commit.
namespace {
monitor::OperationRecorder storageUpdateRecorder{"storage.engine_update"};
monitor::OperationRecorder storageCommitRecorder{"storage.engine_commit"};
} // namespace
// Apply the update described by `job` (write / truncate / remove / ...) through the chunk engine.
//
// Fills `job.result()` with the versions and checksum reported by the engine (even when the
// engine reports an error), stores the returned chunk handle in `job.chunkEngineJob()` on
// success, and returns the resulting chunk length for truncate/extend or the IO length otherwise.
// On failure the engine's error code and message are returned.
Result<uint32_t> ChunkEngine::update(chunk_engine::Engine &engine, UpdateJob &job) {
auto recordGuard = storageUpdateRecorder.record();
// 1. prepare.
const auto &updateIO = job.updateIO();
const auto &chunkId = updateIO.key.chunkId;
const auto &options = job.options();
const auto &state = job.state();
auto &result = job.result();
auto chainId = updateIO.key.vChainId.chainId;
// Engine key = raw bytes of the chain id followed by the chunk id bytes.
std::string key;
key.reserve(sizeof(chainId) + chunkId.data().size());
key.append((const char *)&chainId, sizeof(chainId));
key.append(chunkId.data());
// 2. start update.
chunk_engine::UpdateReq req{};
if (updateIO.isTruncate()) {
req.is_truncate = true;
} else if (updateIO.isRemove()) {
req.is_remove = true;
}
req.is_syncing = options.isSyncing;
req.update_ver = updateIO.updateVer;
req.chain_ver = job.commitChainVer();
// The checksum value is passed bitwise-inverted; results are inverted back below.
// NOTE(review): presumably the engine stores the complemented CRC32C - confirm.
if (updateIO.checksum.type == ChecksumType::CRC32C) {
req.checksum = ~updateIO.checksum.value;
} else if (state.data) {
req.without_checksum = true;
}
if (updateIO.isWrite()) {
req.length = updateIO.length;
req.offset = updateIO.offset;
} else {
// Non-write operations carry the IO length in the offset field and a zero length.
req.length = 0;
req.offset = updateIO.length;
}
// The data buffer is handed to the engine as a raw address.
req.data = reinterpret_cast<uint64_t>(state.data);
req.last_request_id = job.requestCtx().tag.requestId;
// The client uuid is split into two 64-bit words.
// NOTE(review): assumes the uuid view holds at least 16 bytes and tolerates unaligned reads - confirm.
auto clientId = job.requestCtx().tag.clientId.uuid.asStringView();
req.last_client_low = *(const uint64_t *)clientId.data();
req.last_client_high = *(const uint64_t *)(clientId.data() + 8);
std::string error{};
auto chunk = engine.update_raw_chunk(toSlice(key), req, error);
// Result fields are populated before the error check, so they are set on failure as well.
result.updateVer = result.commitVer = ChunkVer{req.out_commit_ver};
result.commitChainVer = ChainVer{req.out_chain_ver};
if (req.is_remove && req.out_non_existent) {
result.checksum = ChecksumInfo{ChecksumType::NONE, 0};
} else {
result.checksum = ChecksumInfo{ChecksumType::CRC32C, ~req.out_checksum};
}
if (UNLIKELY(!error.empty())) {
return makeError(req.out_error_code, std::move(error));
}
// Keep the chunk handle in the job; ChunkEngine::commit picks it up from there.
job.chunkEngineJob().set(engine, chunk);
recordGuard.succ();
if (updateIO.isTruncate() || updateIO.isExtend()) {
return chunk->raw_meta().len;
}
return updateIO.length;
}
// Commit the chunk handle previously stored in `job.chunkEngineJob()` by ChunkEngine::update.
//
// Stamps the chunk with the job's commit chain version, copies the resulting versions into
// `job.result()`, commits through the engine (`sync` is forwarded to the engine) and releases
// the handle held by the job regardless of the outcome.
// Returns 0 on success and kChunkMetadataSetError with the engine's message on failure.
//
// The engine key is not needed here (the chunk handle is passed directly), so none is built.
Result<uint32_t> ChunkEngine::commit(chunk_engine::Engine &engine, UpdateJob &job, bool sync) {
  auto recordGuard = storageCommitRecorder.record();
  auto &result = job.result();

  auto chunk = job.chunkEngineJob().chunk();
  chunk->set_chain_ver(job.commitChainVer());

  // Report the versions as they stand after the chain version has been stamped.
  auto &meta = chunk->raw_meta();
  result.updateVer = result.commitVer = ChunkVer{meta.chunk_ver};
  result.commitChainVer = ChainVer{meta.chain_ver};

  std::string error;
  engine.commit_raw_chunk(chunk, sync, error);
  // The job gives up the handle whether or not the commit succeeded.
  job.chunkEngineJob().release();
  if (UNLIKELY(!error.empty())) {
    return makeError(StorageCode::kChunkMetadataSetError, std::move(error));
  }
  recordGuard.succ();
  return uint32_t{};
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,267 @@
#pragma once
#include <limits>
#include "chunk_engine/src/cxx.rs.h"
#include "common/utils/UtcTime.h"
#include "fbs/storage/Common.h"
#include "storage/aio/BatchReadJob.h"
#include "storage/update/UpdateJob.h"
namespace hf3fs::storage {
struct ChunkEngine {
static void copyMeta(const chunk_engine::RawMeta &in, ChunkMetadata &out) {
out.commitVer = ChunkVer{in.chunk_ver};
out.updateVer = ChunkVer{in.chunk_ver};
out.chainVer = ChainVer{in.chain_ver};
out.size = in.len;
out.chunkState = ChunkState::COMMIT;
out.recycleState = RecycleState::NORMAL;
out.checksumType = ChecksumType::CRC32C;
out.checksumValue = ~in.checksum;
out.innerFileId = ChunkFileId{std::max(uint32_t(in.pos >> 48 << 16), 512u * 1024), 256};
out.innerOffset = in.pos;
out.timestamp = UtcTime::fromMicroseconds(in.timestamp);
out.lastRequestId = RequestId{in.last_request_id};
out.lastClientUuid = Uuid::from(in.last_client_low, in.last_client_high);
}
static rust::Slice<const uint8_t> toSlice(const std::string &key) {
return rust::Slice<const uint8_t>{(const uint8_t *)key.data(), key.size()};
}
static Result<Void> aioPrepareRead(chunk_engine::Engine &engine, AioReadJob &job) {
auto &state = job.state();
if (!state.chunkEngineJob.has_chunk()) {
const auto &chunkId = job.readIO().key.chunkId;
auto chainId = job.readIO().key.vChainId.chainId;
std::string key;
key.reserve(sizeof(chainId) + chunkId.data().size());
key.append((const char *)&chainId, sizeof(chainId));
key.append(chunkId.data());
std::string error;
auto chunk = engine.get_raw_chunk(toSlice(key), error);
if (UNLIKELY(!error.empty())) {
return makeError(StorageCode::kChunkMetadataGetError, std::move(error));
}
if (chunk == nullptr) {
return makeError(StorageCode::kChunkMetadataNotFound);
}
state.chunkEngineJob.set(&engine, chunk);
}
auto &result = job.result();
auto &meta = state.chunkEngineJob.chunk()->raw_meta();
result.commitVer = ChunkVer{meta.chunk_ver};
result.updateVer = ChunkVer{meta.chunk_ver};
result.commitChainVer = ChainVer{meta.chain_ver};
state.chunkLen = meta.len;
state.chunkChecksum = ChecksumInfo{ChecksumType::CRC32C, ~meta.checksum};
auto chunkInfo = state.chunkEngineJob.chunk()->fd_and_offset();
state.readLength = job.alignedLength();
state.readFd = chunkInfo.fd;
state.readOffset = chunkInfo.offset + job.alignedOffset();
return Void{};
}
static Result<uint32_t> update(chunk_engine::Engine &engine, UpdateJob &job);
static Result<uint32_t> commit(chunk_engine::Engine &engine, UpdateJob &job, bool sync);
static Result<ChunkMetadata> queryChunk(chunk_engine::Engine &engine, const ChunkId &chunkId, ChainId chainId) {
std::string key;
key.reserve(sizeof(chainId) + chunkId.data().size());
key.append((const char *)&chainId, sizeof(chainId));
key.append(chunkId.data());
std::string error;
auto chunk = engine.get_raw_chunk(toSlice(key), error);
if (UNLIKELY(!error.empty())) {
return makeError(StorageCode::kChunkMetadataGetError, std::move(error));
}
if (chunk == nullptr) {
return makeError(StorageCode::kChunkMetadataNotFound);
}
ChunkMetadata out;
copyMeta(chunk->raw_meta(), out);
engine.release_raw_chunk(chunk);
return out;
}
static Result<std::vector<std::pair<ChunkId, ChunkMetadata>>> queryChunks(chunk_engine::Engine &engine,
const ChunkIdRange &chunkIdRange,
ChainId chainId) {
const auto &beginChunkId = chunkIdRange.begin;
std::string beginKey;
beginKey.reserve(sizeof(chainId) + beginChunkId.data().size());
beginKey.append((const char *)&chainId, sizeof(chainId));
beginKey.append(beginChunkId.data());
const auto &endChunkId = chunkIdRange.end;
std::string endKey;
endKey.reserve(sizeof(chainId) + endChunkId.data().size());
endKey.append((const char *)&chainId, sizeof(chainId));
endKey.append(endChunkId.data());
std::string error;
auto chunks =
engine.query_raw_chunks(toSlice(beginKey), toSlice(endKey), chunkIdRange.maxNumChunkIdsToProcess, error);
if (UNLIKELY(!error.empty())) {
return makeError(StorageCode::kChunkMetadataGetError, std::move(error));
}
auto len = chunks->len();
std::vector<std::pair<ChunkId, ChunkMetadata>> out;
out.reserve(len);
for (size_t i = 0; i < len; ++i) {
auto chunkId = chunks->chunk_id(i);
auto &in = chunks->chunk_meta(i);
out.emplace_back();
out.back().first =
ChunkId(std::string_view{(const char *)chunkId.data() + sizeof(chainId), chunkId.length() - sizeof(chainId)});
copyMeta(in, out.back().second);
}
return out;
}
static Result<std::vector<ChunkId>> queryUncommittedChunks(chunk_engine::Engine &engine, ChainId chainId) {
rust::Slice<const uint8_t> prefix{(const uint8_t *)&chainId, sizeof(chainId)};
std::string error;
auto chunks = engine.query_uncommitted_raw_chunks(prefix, error);
if (UNLIKELY(!error.empty())) {
return makeError(StorageCode::kChunkMetadataGetError, std::move(error));
}
auto len = chunks->len();
std::vector<ChunkId> out;
out.reserve(len);
for (size_t i = 0; i < len; ++i) {
auto chunkId = chunks->chunk_id(i);
out.push_back(ChunkId(
std::string_view{(const char *)chunkId.data() + sizeof(chainId), chunkId.length() - sizeof(chainId)}));
}
return out;
}
static Result<Void> resetUncommittedChunks(chunk_engine::Engine &engine, ChainId chainId, ChainVer chainVer) {
rust::Slice<const uint8_t> prefix{(const uint8_t *)&chainId, sizeof(chainId)};
std::string error;
auto chunks = engine.handle_uncommitted_raw_chunks(prefix, chainVer, error);
if (UNLIKELY(!error.empty())) {
XLOGF(CRITICAL, "reset uncommitted chunks failed: {}, chain {}", error, chainId);
return makeError(StorageCode::kChunkMetadataGetError, std::move(error));
}
auto len = chunks->len();
XLOGF_IF(CRITICAL, len > 0, "reset uncommitted chunks succ, chain: {}, size: {}", chainId, len);
for (size_t i = 0; i < len; ++i) {
auto chunkId = chunks->chunk_id(i);
auto id =
ChunkId(std::string_view{(const char *)chunkId.data() + sizeof(chainId), chunkId.length() - sizeof(chainId)});
auto &in = chunks->chunk_meta(i);
ChunkMetadata meta{};
copyMeta(in, meta);
XLOGF(CRITICAL, "reset uncommitted chain {} chunk {} meta {}", chainId, id, meta);
}
return Void{};
}
static Result<Void> removeAllChunks(chunk_engine::Engine &engine, ChainId chainId) {
std::string key;
key.reserve(sizeof(chainId));
key.append((const char *)&chainId, sizeof(chainId));
std::string error;
engine.raw_batch_remove(toSlice(key), toSlice(key), std::numeric_limits<uint64_t>::max(), error);
if (UNLIKELY(!error.empty())) {
return makeError(StorageCode::kChunkMetadataSetError, std::move(error));
}
return Void{};
}
static Result<Void> getAllMetadata(chunk_engine::Engine &engine, ChainId chainId, ChunkMetaVector &metadataVec) {
rust::Slice<const uint8_t> prefix{(const uint8_t *)&chainId, sizeof(chainId)};
std::string error;
auto chunks = engine.query_all_raw_chunks(prefix, error);
if (UNLIKELY(!error.empty())) {
return makeError(StorageCode::kChunkMetadataGetError, std::move(error));
}
auto len = chunks->len();
metadataVec.reserve(len);
for (size_t i = 0; i < len; ++i) {
auto chunkId = chunks->chunk_id(i);
auto &in = chunks->chunk_meta(i);
metadataVec.emplace_back();
auto &out = metadataVec.back();
out.chunkId =
ChunkId(std::string_view{(const char *)chunkId.data() + sizeof(chainId), chunkId.length() - sizeof(chainId)});
out.updateVer = ChunkVer{in.chunk_ver};
out.commitVer = ChunkVer{in.chunk_ver};
out.chainVer = ChainVer{in.chain_ver};
out.chunkState = ChunkState::COMMIT;
out.checksum = ChecksumInfo{ChecksumType::CRC32C, ~in.checksum};
out.length = in.len;
if (chunks->chunk_uncommitted(i)) {
out.commitVer = ChunkVer{out.commitVer - 1};
out.chunkState = ChunkState::CLEAN;
}
}
std::sort(metadataVec.begin(), metadataVec.end(), [](auto &a, auto &b) { return a.chunkId > b.chunkId; });
return Void{};
}
static Result<Void> getAllMetadataMap(chunk_engine::Engine &engine,
std::unordered_map<ChunkId, ChunkMetadata> &metas,
ChainId chainId) {
rust::Slice<const uint8_t> prefix{(const uint8_t *)&chainId, sizeof(chainId)};
std::string error;
auto chunks = engine.query_all_raw_chunks(prefix, error);
if (UNLIKELY(!error.empty())) {
return makeError(StorageCode::kChunkMetadataGetError, std::move(error));
}
auto len = chunks->len();
metas.reserve(metas.size() + len);
for (size_t i = 0; i < len; ++i) {
auto chunkId = chunks->chunk_id(i);
auto &meta = metas[ChunkId(
std::string_view{(const char *)chunkId.data() + sizeof(chainId), chunkId.length() - sizeof(chainId)})];
copyMeta(chunks->chunk_meta(i), meta);
if (chunks->chunk_uncommitted(i)) {
meta.commitVer = ChunkVer{meta.commitVer - 1};
meta.chunkState = ChunkState::CLEAN;
}
}
return Void{};
}
static uint64_t chainUsedSize(chunk_engine::Engine &engine, ChainId chainId) {
rust::Slice<const uint8_t> slice{(const uint8_t *)&chainId, sizeof(chainId)};
std::string error;
auto size = engine.query_raw_used_size(slice, error);
if (UNLIKELY(!error.empty())) {
XLOGF(ERR, "query chunk engine chain used size error: chain {} error {}", chainId, error);
}
return size;
}
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,152 @@
#include "storage/store/ChunkFileStore.h"
#include <fcntl.h>
#include <folly/logging/xlog.h>
#include <limits>
#include "common/monitor/Recorder.h"
#include "common/utils/Result.h"
#include "storage/store/ChunkMetadata.h"
namespace hf3fs::storage {
// File-local operation recorders used by ChunkFileStore::punchHole and ChunkFileStore::allocate.
namespace {
monitor::OperationRecorder punchHoleRecorder{"storage.punch_hole"};
monitor::OperationRecorder allocateSpaceRecorder{"storage.allocate_space"};
} // namespace
// Initialize a new file store at config.path: remember the layout parameters and create the
// inner files for every configured chunk size. Stops at the first failure.
Result<Void> ChunkFileStore::create(const PhysicalConfig &config) {
  physicalFileCount_ = config.physical_file_count;
  path_ = config.path;
  for (const auto &size : config.chunk_size_list) {
    RETURN_AND_LOG_ON_ERROR(createInnerFile(size));
  }
  return Void{};
}
// Attach to an existing file store at config.path. Only chunk sizes listed in the
// `preopen_chunk_size_list` config item are opened eagerly (all physical files of that size);
// the remaining files are left to be opened on first use.
Result<Void> ChunkFileStore::load(const PhysicalConfig &config) {
  physicalFileCount_ = config.physical_file_count;
  path_ = config.path;
  for (uint32_t size : config.chunk_size_list) {
    if (!config_.preopen_chunk_size_list().contains(size)) {
      continue;
    }
    for (auto index = 0u; index < physicalFileCount_; ++index) {
      RETURN_AND_LOG_ON_ERROR(openInnerFile(ChunkFileId{size, index}));
    }
  }
  return Void{};
}
// Create the inner files for additional chunk sizes, logging each size that was added.
// Stops at the first failure.
Result<Void> ChunkFileStore::addChunkSize(const std::vector<Size> &sizeList) {
  for (const auto &size : sizeList) {
    RETURN_AND_LOG_ON_ERROR(createInnerFile(size));
    XLOGF(WARNING, "chunk inner files are created, path {}, size {}", path_, size);
  }
  return Void{};
}
// Open (or fetch from the thread-local cache) the inner file `fileId` and return a view that
// carries its normal/direct descriptors and its index.
Result<ChunkFileView> ChunkFileStore::open(ChunkFileId fileId) {
  auto opened = openInnerFile(fileId);
  if (UNLIKELY(!opened)) {
    XLOGF(ERR, "open file {} failed: {}", fileId, opened.error());
    return makeError(std::move(opened.error()));
  }

  const auto &descriptor = **opened;
  ChunkFileView view;
  view.index_ = descriptor.index_;
  view.direct_ = descriptor.direct_;
  view.normal_ = descriptor.normal_;
  return view;
}
// Release the disk space of one chunk: punches a hole of `fileId.chunkSize` bytes at `offset`
// in the inner file while keeping the file size unchanged.
// Returns the open error if the file cannot be opened, or kPunchHoleFailed (with the errno of
// the failed fallocate call) if punching fails.
Result<Void> ChunkFileStore::punchHole(ChunkFileId fileId, size_t offset) {
  auto recordGuard = punchHoleRecorder.record();
  auto openResult = openInnerFile(fileId);
  if (UNLIKELY(!openResult)) {
    XLOGF(ERR, "open file {} failed: {}", fileId, openResult.error());
    return makeError(std::move(openResult.error()));
  }
  auto &innerFile = **openResult;

  const int ret = ::fallocate(innerFile.direct_, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, fileId.chunkSize);
  // Capture errno right away: the logging calls below may overwrite it.
  const int savedErrno = errno;
  XLOGF(DBG, "punch hole {}, offset {}", fileId, Size::toString(offset));
  if (UNLIKELY(ret == -1)) {
    auto msg = fmt::format("punch hole to {} failed: {}", fileId, savedErrno);
    XLOG(ERR, msg);
    return makeError(StorageCode::kPunchHoleFailed, std::move(msg));
  }
  recordGuard.succ();
  return Void{};
}
// Preallocate `size` bytes at `offset` in the inner file `fileId` via fallocate.
// Returns the open error if the file cannot be opened; an allocation failure is reported with
// the errno of the failed fallocate call.
// NOTE(review): allocation failures reuse StorageCode::kPunchHoleFailed - presumably for lack of
// a dedicated code; confirm before changing, callers may match on it.
Result<Void> ChunkFileStore::allocate(ChunkFileId fileId, size_t offset, size_t size) {
  auto recordGuard = allocateSpaceRecorder.record();
  auto openResult = openInnerFile(fileId);
  RETURN_AND_LOG_ON_ERROR(openResult);
  auto &innerFile = **openResult;

  const int ret = ::fallocate(innerFile.direct_, 0, offset, size);
  // Capture errno right away: the logging call below may overwrite it.
  const int savedErrno = errno;
  XLOGF(DBG, "allocate {}, offset {}, size {}", fileId, Size::toString(offset), Size::toString(size));
  if (UNLIKELY(ret == -1)) {
    auto msg = fmt::format("allocate to {} failed: {}", fileId, savedErrno);
    XLOG(ERR, msg);
    return makeError(StorageCode::kPunchHoleFailed, std::move(msg));
  }
  recordGuard.succ();
  return Void{};
}
// Resolve the descriptor of inner file `fileId`.
// A per-thread cache is consulted first; on a miss the file at
// <path_>/<chunk size>/<chunk index as two hex digits> is opened through the global file store
// (created when `createFile` is set) and the descriptor is remembered for this thread.
Result<FileDescriptor *> ChunkFileStore::openInnerFile(ChunkFileId fileId, bool createFile /* = false */) {
  auto &slot = (*tlsCache_)[fileId];
  if (LIKELY(slot != nullptr)) {
    return slot;
  }

  const auto fileName = fmt::format("{:02X}", fileId.chunkIdx);
  Path filePath = path_ / Size::toString(fileId.chunkSize) / fileName;
  auto opened = globalFileStore_.open(filePath, createFile);
  RETURN_AND_LOG_ON_ERROR(opened);
  slot = *opened;
  return opened;
}
// Create the directory and all physical files for one chunk size.
// The size must be at least kAIOAlignSize, a multiple of kAIOAlignSize and no larger than
// kMaxChunkSize; violations yield kChunkStoreInitFailed. A directory creation failure yields
// kChunkOpenFailed.
Result<Void> ChunkFileStore::createInnerFile(Size chunkSize) {
  // Log the validation failure and turn it into an init error.
  const auto reject = [](std::string msg) -> Result<Void> {
    XLOG(ERR, msg);
    return makeError(StorageCode::kChunkStoreInitFailed, std::move(msg));
  };
  if (chunkSize < kAIOAlignSize) {
    return reject(fmt::format("chunk size too small: {}", chunkSize));
  }
  if (chunkSize % kAIOAlignSize) {
    return reject(fmt::format("chunk size not aligned: {}", chunkSize));
  }
  if (chunkSize > kMaxChunkSize) {
    return reject(fmt::format("chunk size too large: {}", chunkSize));
  }

  const auto directory = path_ / Size::toString(chunkSize);
  boost::system::error_code ec{};
  boost::filesystem::create_directories(directory, ec);
  if (UNLIKELY(ec.failed())) {
    auto msg = fmt::format("chunk store create directory {} failed: {}", directory.string(), ec.message());
    XLOG(ERR, msg);
    return makeError(StorageCode::kChunkOpenFailed, std::move(msg));
  }

  for (auto index = 0u; index < physicalFileCount_; ++index) {
    ChunkFileId fileId;
    fileId.chunkSize = chunkSize;
    fileId.chunkIdx = index;
    RETURN_AND_LOG_ON_ERROR(openInnerFile(fileId, true));
  }
  return Void{};
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,65 @@
#pragma once
#include <folly/ThreadLocal.h>
#include <mutex>
#include "common/utils/ConfigBase.h"
#include "common/utils/FdWrapper.h"
#include "common/utils/Path.h"
#include "common/utils/Result.h"
#include "common/utils/RobinHood.h"
#include "common/utils/Shards.h"
#include "storage/store/ChunkFileView.h"
#include "storage/store/ChunkMetadata.h"
#include "storage/store/GlobalFileStore.h"
#include "storage/store/PhysicalConfig.h"
namespace hf3fs::storage {
// Store of the inner files that hold chunk data. Files are addressed by ChunkFileId and
// opened through the shared GlobalFileStore; opened descriptors are cached per thread.
class ChunkFileStore {
public:
// Configuration of the file store.
class Config : public ConfigBase<Config> {
// Set of chunk sizes, empty by default.
// NOTE(review): presumably the sizes whose files are opened eagerly — confirm in the implementation.
CONFIG_ITEM(preopen_chunk_size_list, std::set<Size>{});
};
// Both arguments are stored by reference, so they must outlive this object.
ChunkFileStore(const Config &config, GlobalFileStore &globalFileStore)
: config_(config),
globalFileStore_(globalFileStore) {}
// create file store.
Result<Void> create(const PhysicalConfig &config);
// load file store.
Result<Void> load(const PhysicalConfig &config);
// add new chunk size.
Result<Void> addChunkSize(const std::vector<Size> &sizeList);
// get a chunk file. [thread-safe]
Result<ChunkFileView> open(ChunkFileId fileId);
// recycle a chunk. [thread-safe]
Result<Void> punchHole(ChunkFileId fileId, size_t offset);
// allocate space. [thread-safe]
Result<Void> allocate(ChunkFileId fileId, size_t offset, size_t size);
protected:
// open inner file; with createFile set the file is created if needed. [thread-safe]
Result<FileDescriptor *> openInnerFile(ChunkFileId fileId, bool createFile = false);
// create the directory and every physical file for one chunk size.
Result<Void> createInnerFile(Size chunkSize);
private:
const Config &config_;
GlobalFileStore &globalFileStore_;
// Root directory of the store; inner files live under <path_>/<chunk size>/<index>.
Path path_;
// Number of physical files created per chunk size.
uint32_t physicalFileCount_{};
// NOTE(review): not referenced anywhere in this header — confirm whether it is still needed.
constexpr static auto kShardsNum = 64u;
// Per-thread cache from file id to opened descriptor.
folly::ThreadLocal<robin_hood::unordered_map<ChunkFileId, FileDescriptor *>> tlsCache_;
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,127 @@
#include "storage/store/ChunkFileView.h"
#include <folly/ScopeGuard.h>
#include <folly/logging/xlog.h>
#include <thread>
#include <utility>
#include "common/monitor/Recorder.h"
#include "common/utils/Duration.h"
#include "common/utils/ExponentialBackoffRetry.h"
#include "storage/store/ChunkMetadata.h"
namespace hf3fs::storage {
// Monitoring recorders for raw chunk file I/O.
// Operation recorder wrapped around every ChunkFileView::read call.
monitor::OperationRecorder storageReadRecord{"storage.pread"};
// Operation recorder wrapped around every ChunkFileView::write call.
monitor::OperationRecorder storageWriteRecord{"storage.pwrite"};
// Distribution of requested write sizes (sampled once per write call).
monitor::DistributionRecorder storageWriteSize{"storage.pwrite.size"};
// Counts writes that went through the direct-I/O descriptor.
monitor::CountRecorder storageWriteDirect{"storage.pwrite.direct"};
// Read up to `size` bytes at absolute file offset `offset` into `buf`.
// Uses the direct-I/O descriptor when `direct` is set, the normal one otherwise.
// Keeps calling pread until the request is satisfied or end-of-file is hit, and returns the
// number of bytes actually read (which may be short). A pread failure yields kChunkReadFailed.
Result<uint32_t> ChunkFileView::read(uint8_t *buf, size_t size, size_t offset, bool direct /* = false */) const {
  auto recordGuard = storageReadRecord.record();
  const int fd = direct ? direct_ : normal_;
  uint32_t total = 0;
  while (size > 0) {
    const int got = ::pread(fd, buf, size, offset);
    if (got == 0) {
      // End of file: report the short read to the caller.
      break;
    }
    if (UNLIKELY(got < 0)) {
      auto msg = fmt::format("read chunk file failed: fd {}, offset {}, errno {}", fd, offset, errno);
      XLOG(ERR, msg);
      return makeError(StorageCode::kChunkReadFailed, std::move(msg));
    }
    // Partial progress: advance the window and try again.
    total += got;
    buf += got;
    size -= got;
    offset += got;
  }
  recordGuard.succ();
  return total;
}
// Write `size` bytes from `buf` at chunk-relative `offset`.
// The range must fit inside the chunk (meta.innerFileId.chunkSize), otherwise kInvalidArg is
// returned and a fatal event is reported. The chunk-relative offset is translated to a file
// offset with meta.innerOffset. The direct-I/O descriptor is used only when length, file
// offset and buffer address are all kAIOAlignSize-aligned. Failed pwrite calls are retried
// with exponential backoff until the retry helper returns a zero wait time, at which point
// kChunkWriteFailed is returned. On success returns the number of bytes written.
Result<uint32_t> ChunkFileView::write(const uint8_t *buf, size_t size, size_t offset, const ChunkMetadata &meta) {
  auto recordGuard = storageWriteRecord.record();
  storageWriteSize.addSample(size);

  if (UNLIKELY(size + offset > meta.innerFileId.chunkSize)) {
    auto msg = fmt::format("chunk write exceed chunk size, meta {}, size {}, offset {}", meta, size, offset);
    reportFatalEvent();
    XLOG(DFATAL, msg);
    return makeError(StatusCode::kInvalidArg, std::move(msg));
  }
  offset += meta.innerOffset;

  // Direct I/O needs every dimension of the request to be aligned.
  const bool aligned = size % kAIOAlignSize == 0 && offset % kAIOAlignSize == 0 &&
                       reinterpret_cast<uint64_t>(buf) % kAIOAlignSize == 0;
  const int fd = aligned ? direct_ : normal_;
  if (aligned) {
    storageWriteDirect.addSample(1);
    XLOGF(DBG, "use direct fd for write: fd {}, size {}, offset {}", fd, size, offset);
  }

  uint32_t written = 0;
  ExponentialBackoffRetry retry(100_ms, 5_s, 30_s);
  while (size > 0) {
    const int ret = ::pwrite(fd, buf, size, offset);
    if (LIKELY(ret > 0)) {
      written += ret;
      buf += ret;
      size -= ret;
      offset += ret;
      continue;
    }

    // Zero or negative return: log, then back off and retry unless the retry helper is exhausted.
    auto msg = fmt::format("write chunk file failed: fd {}, direct {}, buf {}, offset {}, size {}, ret {}, errno {}",
                           fd,
                           fd == direct_,
                           fmt::ptr(buf),
                           offset,
                           size,
                           ret,
                           errno);
    XLOG(ERR, msg);
    auto waitTime = retry.getWaitTime();
    if (waitTime.count() == 0) {
      return makeError(StorageCode::kChunkWriteFailed, std::move(msg));
    }
    std::this_thread::sleep_for(waitTime);
  }
  recordGuard.succ();
  return written;
}
// Compute the checksum of `size` bytes starting at chunk-relative `offset`.
// The range must lie inside the chunk (meta.innerFileId.chunkSize); otherwise a fatal event
// is reported and kInvalidArg is returned. The data is streamed from the file through a
// ChunkDataIterator. If the computed checksum comes back with type NONE the call is treated
// as a read failure and kChunkReadFailed is returned.
Result<ChecksumInfo> ChunkFileView::checksum(ChecksumType type, size_t size, size_t offset, const ChunkMetadata &meta) {
  if (UNLIKELY(size + offset > meta.innerFileId.chunkSize)) {
    // The message names the checksum operation so that it is not confused with the write path.
    auto msg = fmt::format("chunk checksum exceed chunk size, meta {}, size {}, offset {}", meta, size, offset);
    reportFatalEvent();
    XLOG(DFATAL, msg);
    return makeError(StatusCode::kInvalidArg, std::move(msg));
  }
  // Translate the chunk-relative offset into an offset inside the inner file.
  offset += meta.innerOffset;
  ChunkDataIterator iter(*this, size, offset);
  auto checksum = ChecksumInfo::create(type, &iter, size);
  if (checksum.type == ChecksumType::NONE) return makeError(StorageCode::kChunkReadFailed);
  return checksum;
}
std::pair<const uint8_t *, size_t> ChunkDataIterator::next() {
if (length_ == 0) return {nullptr, 0};
size_t readSize = std::min(length_, ChecksumInfo::kChunkSize);
bool directIO = readSize % kAIOAlignSize == 0 && offset_ % kAIOAlignSize == 0;
auto readRes = chunkFile_.read(data_, readSize, offset_, directIO);
if (!readRes) {
XLOGF(ERR, "Cannot calculate checksum since read failed, error: {}", readRes);
return {nullptr, 0};
} else if (*readRes != readSize) {
XLOGF(ERR, "Cannot calculate checksum since read size {} not equal to requested size {}", *readRes, readSize);
return {nullptr, 0};
}
offset_ += readSize;
length_ -= readSize;
return {data_, readSize};
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,53 @@
#pragma once
#include <folly/Range.h>
#include "common/utils/Result.h"
#include "fbs/storage/Common.h"
namespace hf3fs::storage {
// View of one inner chunk file, holding a normal and a direct-I/O file descriptor.
// The descriptors are filled in by ChunkFileStore (declared a friend below).
class ChunkFileView {
public:
// read a piece of data at an absolute file offset; `direct` selects the direct-I/O descriptor.
Result<uint32_t> read(uint8_t *buf, size_t size, size_t offset, bool direct = false) const;
// write a piece of data; `offset` is relative to the chunk described by `meta`.
Result<uint32_t> write(const uint8_t *buf, size_t size, size_t offset, const ChunkMetadata &meta);
// calculate the chunk checksum over `size` bytes starting at chunk-relative `offset`.
Result<ChecksumInfo> checksum(ChecksumType type, size_t size, size_t offset, const ChunkMetadata &meta);
// get direct fd for aio read.
int directFD() const { return direct_; }
// get fd index in list (empty optional when no index has been assigned).
auto &index() const { return index_; }
private:
friend class ChunkFileStore;
// NOTE(review): both descriptors have no default initializer; they are expected to be set by
// ChunkFileStore before use — confirm.
int normal_;
int direct_;
std::optional<uint32_t> index_{};
};
class ChunkDataIterator : public ChecksumInfo::DataIterator {
public:
ChunkDataIterator(ChunkFileView &chunkFile, size_t length, size_t offset)
: data_((uint8_t *)memory::memalign(kAIOAlignSize, ChecksumInfo::kChunkSize)),
chunkFile_(chunkFile),
length_(length),
offset_(offset) {}
~ChunkDataIterator() override { memory::deallocate(data_); }
std::pair<const uint8_t *, size_t> next() override;
private:
uint8_t *data_;
ChunkFileView &chunkFile_;
size_t length_;
size_t offset_;
};
} // namespace hf3fs::storage

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,161 @@
#pragma once
#include <folly/AtomicUnorderedMap.h>
#include <memory>
#include <queue>
#include "common/utils/ConfigBase.h"
#include "common/utils/Path.h"
#include "common/utils/UtcTime.h"
#include "kv/KVStore.h"
#include "storage/store/ChunkFileStore.h"
#include "storage/store/ChunkMetadata.h"
#include "storage/store/PhysicalConfig.h"
namespace hf3fs::storage {
// Metadata store for chunks, backed by a kv::KVStore. It maps chunk ids to ChunkMetadata and
// keeps per-chunk-size allocation state (created / recycled positions and counters).
class ChunkMetaStore {
public:
// Hot-updatable configuration. The second macro argument is presumably the default value and
// the optional third argument a validity checker — confirm against ConfigBase.
class Config : public ConfigBase<Config> {
// Must be non-zero and a multiple of kMaxChunkSize (enforced by the checker lambda).
CONFIG_HOT_UPDATED_ITEM(allocate_size, 256_MB, [](Size s) { return s && s % kMaxChunkSize == 0; });
CONFIG_HOT_UPDATED_ITEM(recycle_batch_size, 256u, ConfigCheckers::checkPositive);
CONFIG_HOT_UPDATED_ITEM(punch_hole_batch_size, 16u, ConfigCheckers::checkPositive);
CONFIG_HOT_UPDATED_ITEM(removed_chunk_expiration_time, 3_d);
CONFIG_HOT_UPDATED_ITEM(removed_chunk_force_recycled_time, 1_h);
};
// Both arguments are stored by reference and must outlive this object.
// allocateState_ is constructed with the argument 16.
ChunkMetaStore(const Config &config, ChunkFileStore &fileStore)
: config_(config),
fileStore_(fileStore),
allocateState_(16) {}
~ChunkMetaStore();
// create chunk meta store.
Result<Void> create(const kv::KVStore::Config &config, const PhysicalConfig &targetConfig);
// load chunk meta store.
Result<Void> load(const kv::KVStore::Config &config,
const PhysicalConfig &targetConfig,
bool createIfMissing = false);
// add new chunk size.
Result<Void> addChunkSize(const std::vector<Size> &sizeList);
// migrate chunk meta store.
Result<Void> migrate(const kv::KVStore::Config &config, const PhysicalConfig &targetConfig);
// get metadata of chunk. [thread-safe]
Result<Void> get(const ChunkId &chunkId, ChunkMetadata &meta);
// set metadata of chunk. [thread-safe]
Result<Void> set(const ChunkId &chunkId, const ChunkMetadata &meta);
// remove metadata of chunk. [thread-safe]
Result<Void> remove(const ChunkId &chunkId, const ChunkMetadata &meta);
// create a chunk. [thread-safe]
Result<Void> createChunk(const ChunkId &chunkId,
ChunkMetadata &meta,
uint32_t chunkSize,
folly::CPUThreadPoolExecutor &executor,
bool allowToAllocate);
// recycle a batch of chunks, return true if has more. [thread-safe]
Result<bool> punchHole();
// sync the LOG of kv.
Result<Void> sync();
// get used size: created size minus removed size, clamped at zero.
uint64_t usedSize() const { return std::max(int64_t(createdSize_.load() - removedSize_.load()), 0l); }
// get reserved and unrecycled size (returned through the two output parameters).
Result<Void> unusedSize(int64_t &reservedSize, int64_t &unrecycledSize);
// get all uncommitted chunk ids.
auto &uncommitted() { return uncommitted_; }
// enable or disable emergency recycling.
void setEmergencyRecycling(bool enable) { emergencyRecycling_ = enable; }
// iterator over chunk metadata, wrapping a KV store iterator.
class Iterator {
public:
explicit Iterator(kv::KVStore::IteratorPtr it, std::string_view chunkIdPrefix);
// seek a chunk id prefix.
void seek(std::string_view chunkIdPrefix);
// return valid or not.
bool valid() const;
// get current chunk id.
ChunkId chunkId() const;
// get current metadata.
Result<ChunkMetadata> meta() const;
// next metadata.
void next();
// check status.
Result<Void> status() const;
private:
kv::KVStore::IteratorPtr it_;
};
// create an iterator positioned by the given chunk id prefix (empty prefix by default).
Result<Iterator> iterator(std::string_view chunkIdPrefix = {});
protected:
Result<Void> checkSentinel(std::string_view key);
Result<Void> getSize(std::string_view key, std::atomic<uint64_t> &size);
// Allocation bookkeeping for one chunk size. The trailing comments name the mutex that is
// meant to guard each field.
struct AllocateState {
std::mutex createMutex;
std::mutex recycleMutex;
std::mutex allocateMutex;
std::atomic<bool> loaded{};
std::atomic<bool> allocating{};
std::atomic<bool> recycling{};
uint32_t chunkSize{};
uint32_t allocateIndex{}; // createMutex.
std::atomic<uint64_t> startingPoint{}; // createMutex.
std::atomic<uint64_t> createdCount{}; // createMutex.
std::atomic<uint64_t> usedCount{}; // createMutex.
std::atomic<uint64_t> removedCount{};
std::atomic<uint64_t> recycledCount{}; // recycleMutex
std::atomic<uint64_t> reusedCount{}; // createMutex
std::atomic<uint64_t> holeCount{}; // recycleMutex
std::atomic<UtcTime> oldestRemovedTimestamp{}; // recycleMutex
std::vector<ChunkPosition> createdChunks; // createMutex
std::vector<ChunkPosition> recycledChunks; // createMutex
robin_hood::unordered_map<uint32_t, size_t> fileSize; // createMutex
};
void createAllocateState(uint32_t chunkSize);
Result<AllocateState *> loadAllocateState(uint32_t chunkSize);
Result<Void> allocateChunks(AllocateState &state, bool withLock = false);
bool needRecycleRemovedChunks(AllocateState &state);
Result<Void> recycleRemovedChunks(AllocateState &state, bool withLock = false);
Result<bool> punchHoleRemovedChunks(AllocateState &state, uint64_t expirationUs);
private:
const Config &config_;
ChunkFileStore &fileStore_;
std::unique_ptr<kv::KVStore> kv_;
std::string sentinel_;
std::string kvName_;
bool hasSentinel_ = false;
uint32_t physicalFileCount_ = 256;
// Running totals that feed usedSize().
std::atomic<uint64_t> createdSize_ = 0;
std::atomic<uint64_t> removedSize_ = 0;
std::vector<ChunkId> uncommitted_;
std::atomic<bool> emergencyRecycling_ = false;
// Allocation state keyed by a uint32_t (presumably the chunk size — confirm in the implementation).
folly::AtomicUnorderedInsertMap<uint32_t, std::unique_ptr<AllocateState>> allocateState_;
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,14 @@
#include "ChunkMetadata.h"
#include "common/monitor/Recorder.h"
namespace hf3fs::storage {
namespace {
// File-local counter published under the name "storage.fatal".
monitor::CountRecorder fatalEvent{"storage.fatal"};
} // namespace
// Record one fatal event by adding a sample of 1 to the "storage.fatal" counter.
void reportFatalEvent() { fatalEvent.addSample(1); }
} // namespace hf3fs::storage

View File

@@ -0,0 +1,43 @@
#pragma once
#include <atomic>
#include <bit>
#include <cstddef>
#include <cstdint>
#include <folly/Hash.h>
#include <folly/lang/Bits.h>
#include <string>
#include "common/serde/BigEndian.h"
#include "common/serde/Serde.h"
#include "common/utils/Int128.h"
#include "common/utils/Result.h"
#include "common/utils/Size.h"
#include "common/utils/StrongType.h"
#include "fbs/storage/Common.h"
#include "storage/store/ChunkFileView.h"
namespace hf3fs::storage {
// Upper bound for the size of a single chunk.
inline constexpr auto kMaxChunkSize = 64_MB;
// A chunk's metadata together with the view of the file that stores its data.
struct ChunkInfo {
ChunkMetadata meta;
ChunkFileView view;
};
// Position of a chunk inside the physical files: file index plus an offset.
// The offset field is declared as a big-endian size_t wrapper.
struct ChunkPosition {
SERDE_STRUCT_FIELD(fileIdx, uint32_t{});
SERDE_STRUCT_FIELD(offset, serde::BigEndian<std::size_t>{});
};
// Record a fatal event in monitoring (defined in ChunkMetadata.cc).
void reportFatalEvent();
} // namespace hf3fs::storage
// std::hash specialization so ChunkFileId can be used as a key in hash containers.
template <>
struct ::std::hash<hf3fs::storage::ChunkFileId> {
// Hashes the id by reinterpreting its storage as one 64-bit word and mixing it.
// NOTE(review): this assumes ChunkFileId is exactly 8 bytes with no padding — confirm against
// its definition; a static_assert on sizeof would make the assumption explicit.
size_t operator()(hf3fs::storage::ChunkFileId id) const {
return folly::hash::twang_mix64(reinterpret_cast<uint64_t &>(id));
}
};

View File

@@ -0,0 +1,469 @@
#include "storage/store/ChunkReplica.h"
#include <folly/Random.h>
#include <folly/logging/xlog.h>
#include "common/monitor/Recorder.h"
#include "common/utils/Duration.h"
#include "common/utils/FileUtils.h"
#include "common/utils/Result.h"
#include "common/utils/SysResource.h"
#include "fbs/storage/Common.h"
#include "scn/scan/scan.h"
#include "storage/store/ChunkMetadata.h"
namespace hf3fs::storage {
// File-local state: a zero-filled buffer and the monitoring recorders for chunk operations.
namespace {
// Zero-initialized buffer of kMaxChunkSize bytes, used as the data source when a chunk has to
// be padded with zeros.
std::array<uint8_t, kMaxChunkSize> kZeroBytes{};
// Read path.
monitor::OperationRecorder storageAioReadRecorder{"storage.chunk_read"};
monitor::CountRecorder storageReadUncommitted{"storage.chunk_read.uncommitted"};
// Update path, including which strategy was used to refresh the chunk checksum.
monitor::OperationRecorder storageUpdateRecorder{"storage.chunk_update"};
monitor::CountRecorder storageUpdateSyncSmallerVersion{"storage.chunk_update.sync_smaller_version"};
monitor::CountRecorder storageUpdateChecksumReadChunk{"storage.chunk_update.checksum_read_chunk"};
monitor::CountRecorder storageUpdateChecksumCombine{"storage.chunk_update.checksum_combine"};
monitor::CountRecorder storageUpdateChecksumReuse{"storage.chunk_update.checksum_reuse"};
monitor::CountRecorder storageUpdateChecksumNone{"storage.chunk_update.checksum_none"};
monitor::CountRecorder storageUpdateSeqWrite{"storage.chunk_update.seq_write"};
// Commit path.
monitor::OperationRecorder storageCommitRecorder{"storage.chunk_commit"};
monitor::CountRecorder storageCommitDirty{"storage.chunk_commit.dirty"};
monitor::CountRecorder storageCommitStale{"storage.chunk_commit.stale"};
} // namespace
// prepare aio read.
// Prepare an asynchronous read of a chunk.
// Looks up the chunk, copies its versions, length and checksum into the job, rejects the read
// with kChunkNotCommit when commit and update versions differ (unless the job allows reading
// uncommitted data), and finally fills in the descriptor, offset and length for the AIO layer.
Result<Void> ChunkReplica::aioPrepareRead(ChunkStore &store, AioReadJob &job) {
  auto recordGuard = storageAioReadRecorder.record();
  auto &result = job.result();
  auto &state = job.state();
  const auto &chunkId = job.readIO().key.chunkId;

  // Fetch the chunk; a lookup failure is logged at INFO level and returned as is.
  auto lookup = store.get(chunkId);
  if (UNLIKELY(lookup.hasError())) {
    XLOGF(INFO, "{}", lookup.error());
    RETURN_ERROR(lookup);
  }
  auto &chunkInfo = (*lookup)->second;
  const ChunkMetadata &meta = chunkInfo.meta;

  // Publish the chunk's versions and shape to the job before any validation.
  result.commitVer = meta.commitVer;
  result.updateVer = meta.updateVer;
  result.commitChainVer = meta.chainVer;
  state.chunkLen = meta.size;
  state.chunkChecksum = meta.checksum();

  // A pending update makes the chunk unreadable unless explicitly permitted.
  const bool hasPendingUpdate = result.commitVer != result.updateVer;
  if (UNLIKELY(hasPendingUpdate && !state.readUncommitted)) {
    auto msg = fmt::format("chunk {} {} version mismatch {} != {}", chunkId, meta, result.commitVer, result.updateVer);
    XLOG(ERR, msg);
    storageReadUncommitted.addSample(1);
    return makeError(StorageCode::kChunkNotCommit, std::move(msg));
  }

  // Describe the physical read: direct fd, optional fd index, file offset and aligned length.
  state.readLength = job.alignedLength();
  state.readFd = chunkInfo.view.directFD();
  state.fdIndex = chunkInfo.view.index();
  state.readOffset = meta.innerOffset + job.alignedOffset();

  recordGuard.succ();
  return Void{};
}
// finish aio read.
// Validate a finished asynchronous read.
// Re-fetches the chunk and fails with kChunkNotCommit if its update version no longer matches
// the one recorded in the job result, i.e. the chunk changed since the read was prepared.
Result<Void> ChunkReplica::aioFinishRead(ChunkStore &store, AioReadJob &job) {
  auto &readIO = job.readIO();
  auto &result = job.result();
  const auto &chunkId = readIO.key.chunkId;

  auto lookup = store.get(chunkId);
  if (UNLIKELY(!lookup)) {
    return makeError(std::move(lookup.error()));
  }
  const ChunkMetadata &meta = (*lookup)->second.meta;

  if (LIKELY(result.updateVer == meta.updateVer)) {
    return Void{};
  }

  // The chunk was updated while the read was in flight.
  auto msg = fmt::format("chunk {} {} version outdated {} != {}", chunkId, meta, result.updateVer, meta.updateVer);
  XLOG(ERR, msg);
  storageReadUncommitted.addSample(1);
  return makeError(StorageCode::kChunkNotCommit, std::move(msg));
}
// Write a buffer into the chunk's file and grow the recorded chunk size if the write extends it.
// In debug builds a failure can be injected: if the flag file
// /tmp/storage_main_write_failed.<pid> holds a number, that percentage of writes fails with
// kChunkWriteFailed before touching the file.
// Returns the result of ChunkFileView::write; failures are logged.
static Result<uint32_t> doRealWrite(const ChunkId &chunkId,
                                    ChunkInfo &chunkInfo,
                                    const uint8_t *writeData,
                                    uint32_t writeSize,
                                    uint32_t writeOffset) {
#ifndef NDEBUG
  // For debug and unittest.
  static auto flagPath = Path{fmt::format("/tmp/storage_main_write_failed.{}", SysResource::pid())};
  auto checkResult = loadFile(flagPath);
  uint32_t writeErrorPercent = 0;
  if (checkResult && scn::scan(*checkResult, "{}", writeErrorPercent) &&
      folly::Random::rand32(100) < writeErrorPercent) {
    auto msg = fmt::format("chunk replica write error for unittest");
    XLOG(ERR, msg);
    return makeError(StorageCode::kChunkWriteFailed, std::move(msg));
  }
#endif

  ChunkMetadata &meta = chunkInfo.meta;
  auto outcome = chunkInfo.view.write(writeData, writeSize, writeOffset, meta);
  if (UNLIKELY(!outcome)) {
    XLOGF(ERR, "chunk replica {} {} write error {}", chunkId, meta, outcome.error());
    return outcome;
  }

  // The chunk only ever grows here; writes inside the existing range leave the size untouched.
  meta.size = std::max(uint32_t(meta.size), writeOffset + outcome.value());
  return outcome;
}
// do write.
// Apply one update (write / truncate / extend / remove) to a chunk.
//
// Steps:
//   1. Validate the request range and load (or prepare to create) the chunk metadata.
//   2. Validate chunk size, chunk state, chain version, payload checksum and update version,
//      then mark the chunk DIRTY and store the metadata (creating the chunk when needed).
//   3. Perform the data operation.
//   4. Refresh the chunk checksum, mark the chunk CLEAN again and store the metadata.
//
// job.result() is filled with the chunk's versions and checksum along the way, also on most
// error paths. If step 3 or 4 fails the chunk stays DIRTY.
//
// Returns the number of bytes written for a normal write, the resulting chunk length for
// truncate/extend, and 0 for remove.
Result<uint32_t> ChunkReplica::update(ChunkStore &store, UpdateJob &job, folly::CPUThreadPoolExecutor &executor) {
  auto recordGuard = storageUpdateRecorder.record();
  const auto &writeIO = job.updateIO();
  const auto &options = job.options();
  const auto &chunkId = writeIO.key.chunkId;
  const auto &state = job.state();
  auto &result = job.result();

  // Non-remove requests must start inside the chunk and must not run past its end.
  if (UNLIKELY(!writeIO.isRemove() &&
               (writeIO.offset >= writeIO.chunkSize || writeIO.offset + writeIO.length > writeIO.chunkSize))) {
    auto msg = fmt::format("chunk {} write offset exceed chunk size {}", chunkId, writeIO);
    XLOG(ERR, msg);
    return makeError(StatusCode::kInvalidArg, std::move(msg));
  }

  // 1. get meta info.
  ChunkInfo chunkInfo;
  bool needCreateChunk = false;
  auto metaResult = store.get(chunkId);
  if (metaResult) {
    // Work on a copy; the store is only updated through set()/createChunk() below.
    chunkInfo = (*metaResult)->second;
  } else if (metaResult.error().code() == StorageCode::kChunkMetadataNotFound) {
    if (writeIO.isRemove()) {
      // Removing a chunk that does not exist succeeds without doing anything.
      result.commitVer = result.updateVer = writeIO.updateVer;
      result.commitChainVer = job.commitChainVer();
      return 0;
    } else {
      needCreateChunk = true;
      chunkInfo.meta.chainVer = job.commitChainVer();
      chunkInfo.meta.chunkState = ChunkState::CLEAN;
      chunkInfo.meta.innerFileId.chunkSize = writeIO.chunkSize;
    }
  } else {
    RETURN_AND_LOG_ON_ERROR(metaResult);
  }
  ChunkMetadata &meta = chunkInfo.meta;

  // 2. begin to write.
  // A remove carries no chunk size of its own, so the stored one is used.
  auto chunkSize = writeIO.isRemove() ? meta.innerFileId.chunkSize : writeIO.chunkSize;
  result.commitVer = meta.commitVer;
  result.updateVer = meta.updateVer;
  result.checksum = meta.checksum();
  result.commitChainVer = meta.chainVer;
  if (UNLIKELY(meta.innerFileId.chunkSize != chunkSize)) {
    auto msg = fmt::format("chunk {} {} chunk size mismatch {}", chunkId, meta, chunkSize);
    XLOG(ERR, msg);
    return makeError(StorageCode::kChunkSizeMismatch, std::move(msg));
  }
  // A DIRTY chunk may only be overwritten by a syncing update.
  if (UNLIKELY(meta.chunkState == ChunkState::DIRTY && !options.isSyncing)) {
    auto msg = fmt::format("chunk {} {} state not valid", chunkId, meta);
    XLOG(ERR, msg);
    return makeError(StorageCode::kChunkNotClean, std::move(msg));
  }
  // A committed chunk must not be updated with an older chain version.
  if (job.commitChainVer() < meta.chainVer && meta.chunkState == ChunkState::COMMIT) {
    auto msg = fmt::format("chunk {} {} chain version mismatch {} {}", chunkId, meta, writeIO, options);
    reportFatalEvent();
    XLOG(DFATAL, msg);
    return makeError(StorageCode::kChainVersionMismatch, std::move(msg));
  }
  // Verify the payload against the checksum supplied with the request.
  if (writeIO.checksum.type != ChecksumType::NONE && writeIO.length != 0) {
    auto checksum = ChecksumInfo::create(writeIO.checksum.type, state.data, writeIO.length);
    if (checksum != writeIO.checksum) {
      // With fault injection enabled a mismatch is expected and not reported as fatal.
      if (!job.requestCtx().debugFlags.faultInjectionEnabled()) {
        reportFatalEvent();
      }
      XLOGF_IF(DFATAL,
               !job.requestCtx().debugFlags.faultInjectionEnabled(),
               "Local checksum {} not equal to checksum {} generated by client, write io: {}",
               checksum,
               writeIO.checksum,
               writeIO);
      return makeError(StorageCode::kChecksumMismatch);
    }
  }
  XLOGF(DBG, "chunk {} {} write begin", chunkId, meta);
  if (options.isSyncing) {
    // Syncing writes impose their version: the update version is taken from the request and
    // the commit version is set to the one just before it.
    XLOGF(DBG9, "chunk {} {} sync write: {}", chunkId, meta, writeIO);
    meta.updateVer = writeIO.updateVer;
    meta.commitVer = ChunkVer{writeIO.updateVer - 1};
    meta.recycleState = RecycleState::NORMAL;
  } else if (writeIO.updateVer > 0) {
    // An explicit version must be exactly the next one after the current update version.
    if (writeIO.updateVer <= meta.commitVer) {
      auto msg = fmt::format("chunk {} {} committed update {} <= {}", chunkId, meta, writeIO.updateVer, meta.commitVer);
      XLOG(ERR, msg);
      return makeError(StorageCode::kChunkCommittedUpdate, std::move(msg));
    } else if (writeIO.updateVer <= meta.updateVer) {
      auto msg = fmt::format("chunk {} {} stale update {} <= {}", chunkId, meta, writeIO.updateVer, meta.updateVer);
      XLOG(ERR, msg);
      return makeError(StorageCode::kChunkStaleUpdate, std::move(msg));
    } else if (writeIO.updateVer > meta.updateVer + 1) {
      auto msg =
          fmt::format("chunk {} {} missing update {} > {} + 1", chunkId, meta, writeIO.updateVer, meta.updateVer);
      XLOG(ERR, msg);
      return makeError(StorageCode::kChunkMissingUpdate, std::move(msg));
    }
    meta.updateVer = writeIO.updateVer;
  } else {
    // No version supplied: assign the next one, which may be at most one ahead of the commit version.
    meta.updateVer += 1;
    if (meta.updateVer > meta.commitVer + 1) {
      auto msg = fmt::format("chunk {} {} advance update {}", chunkId, meta, writeIO);
      XLOG(ERR, msg);
      return makeError(StorageCode::kChunkAdvanceUpdate, std::move(msg));
    }
  }
  meta.chunkState = ChunkState::DIRTY;
  meta.chainVer = job.commitChainVer();
  meta.lastRequestId = job.requestCtx().tag.requestId;
  meta.lastClientUuid = job.requestCtx().tag.clientId.uuid;
  meta.timestamp = UtcClock::now();

  // Append writes, truncates and extends pass persist=false to store.set().
  const bool isAppendWrite = writeIO.offset == meta.size;
  const bool skipPersist = (writeIO.isWrite() && isAppendWrite) || writeIO.isTruncate() || writeIO.isExtend();
  auto setResult = needCreateChunk ? store.createChunk(chunkId, chunkSize, chunkInfo, executor, job.allowToAllocate())
                                   : store.set(chunkId, chunkInfo, !skipPersist);
  if (UNLIKELY(!setResult)) {
    return makeError(std::move(setResult.error()));
  }
  result.commitChainVer = meta.chainVer;
  result.updateVer = meta.updateVer;
  uint32_t chunkSizeBeforeWrite = meta.size;

  // 3. do write operation.
  Result<uint32_t> writeResult = 0;
  if (writeIO.isTruncate() || writeIO.isExtend()) {
    if (writeIO.length <= meta.size) {
      if (writeIO.isTruncate()) {
        // Truncate shrinks the recorded size; no data is touched.
        writeResult = (meta.size = writeIO.length);
      } else {
        // Extend never shrinks: the chunk is already long enough.
        writeResult = meta.size;
      }
    } else {
      // extend the chunk (fill zeros)
      writeResult = doRealWrite(chunkId, chunkInfo, kZeroBytes.data(), writeIO.length - meta.size, meta.size);
      if (writeResult) {
        writeResult = meta.size;  // set result to the actual chunk length if write succeeds
      }
    }
  } else if (writeIO.isRemove()) {
    // remove.
    if (!meta.readyToRemove()) {
      meta.recycleState = RecycleState::REMOVAL_IN_PROGRESS;
    }
    writeResult = 0;
  } else {
    // fill zeros before the write range if there is a gap
    if (meta.size < writeIO.offset) {
      RETURN_AND_LOG_ON_ERROR(chunkInfo.view.write(kZeroBytes.data(), writeIO.offset - meta.size, meta.size, meta));
    }
    // normal write.
    writeResult = doRealWrite(chunkId, chunkInfo, state.data, writeIO.length, writeIO.offset);
    if (writeResult) {
      // A syncing write replaces the chunk content, so the size is exactly the written length.
      if (options.isSyncing) meta.size = writeIO.length;
      storageUpdateSeqWrite.addSample(writeIO.offset == chunkSizeBeforeWrite);
    }
  }
  if (UNLIKELY(!writeResult)) {
    return writeResult;  // chunk becomes dirty.
  }

  // update chunk checksum
  auto checksumRes = updateChecksum(chunkInfo, writeIO, chunkSizeBeforeWrite, isAppendWrite);
  if (UNLIKELY(!checksumRes)) {
    return makeError(std::move(checksumRes.error()));
  }

  // 4. finish to write.
  meta.chunkState = ChunkState::CLEAN;
  XLOGF(DBG, "chunk {} {} write finish", chunkId, meta);
  setResult = store.set(chunkId, chunkInfo, !skipPersist);
  if (UNLIKELY(!setResult)) {
    return makeError(std::move(setResult.error()));
  }
  result.checksum = meta.checksum();
  result.commitVer = meta.commitVer;
  result.commitChainVer = meta.chainVer;
  recordGuard.succ();
  return writeResult;
}
// Refresh the checksum stored in the chunk metadata after an update was applied.
//
// chunkInfo:            chunk whose metadata (size already updated) receives the new checksum.
// writeIO:              the applied update; taken by value because truncate/extend rewrite it
//                       into an empty write at the end of the chunk.
// chunkSizeBeforeWrite: chunk length before the update.
// isAppendWrite:        whether the update started exactly at the previous end of the chunk.
//
// Strategy, in order of preference:
//   - no checksum type or empty chunk: the value is reset to 0;
//   - the write covers the whole chunk: the write's checksum is reused;
//   - append to a non-empty chunk with the same checksum type: the two checksums are combined;
//   - otherwise: prefix and suffix are read back from the file and combined with the write's
//     checksum.
// Any failure to read or combine is returned to the caller, so a wrong checksum is never stored.
Result<Void> ChunkReplica::updateChecksum(ChunkInfo &chunkInfo,
                                          UpdateIO writeIO,
                                          uint32_t chunkSizeBeforeWrite,
                                          bool isAppendWrite) {
  const auto &chunkId = writeIO.key.chunkId;
  ChunkMetadata &meta = chunkInfo.meta;
  auto chunkChecksum = meta.checksum();
  bool combineChecksum = chunkSizeBeforeWrite > 0 && isAppendWrite;

  if (writeIO.isTruncate() || writeIO.isExtend()) {
    // Model truncate/extend as a zero-length write at the new end of the chunk.
    writeIO.checksum = ChecksumInfo::create(meta.checksumType, (const uint8_t *)nullptr, 0);
    writeIO.offset = meta.size;
    writeIO.length = 0;
  }

  if (writeIO.checksum.type == ChecksumType::NONE || meta.size == 0) {
    meta.checksumValue = 0;
    storageUpdateChecksumNone.addSample(1);
  } else if (writeIO.offset == 0 && writeIO.length == meta.size) {
    meta.checksumValue = writeIO.checksum.value;
    storageUpdateChecksumReuse.addSample(1);
  } else if (writeIO.checksum.type == chunkChecksum.type && combineChecksum) {
    // combine the chunk checksum and write io checksum if this write appends to existing chunk
    auto combinResult = chunkChecksum.combine(writeIO.checksum, writeIO.length);
    if (UNLIKELY(!combinResult)) {
      XLOGF(ERR,
            "Failed to combine checksums: error {}, chunkId {}, meta {}, write io: {}",
            combinResult.error(),
            chunkId,
            meta,
            writeIO);
      return makeError(combinResult.error());
    }
    meta.checksumValue = chunkChecksum.value;
    storageUpdateChecksumCombine.addSample(1);
  } else {
    // read the prefix of chunk and compute its checksum
    auto prefixChecksum = chunkInfo.view.checksum(writeIO.checksum.type, writeIO.offset, 0, meta);
    if (UNLIKELY(!prefixChecksum)) {
      XLOGF(ERR,
            "Failed to calculate chunk prefix checksum: error {}, chunkId {}, meta {}, write io: {}",
            prefixChecksum.error(),
            chunkId,
            meta,
            writeIO);
      return makeError(std::move(prefixChecksum.error()));
    }
    // read the suffix of chunk and compute its checksum
    uint32_t suffixStart = std::min(writeIO.offset + writeIO.length, meta.size);
    uint32_t suffixLength = meta.size - suffixStart;
    auto suffixChecksum = chunkInfo.view.checksum(writeIO.checksum.type, suffixLength, suffixStart, meta);
    if (UNLIKELY(!suffixChecksum)) {
      XLOGF(ERR,
            "Failed to calculate chunk suffix checksum: error {}, chunkId {}, meta {}, write io: {}",
            suffixChecksum.error(),
            chunkId,
            meta,
            writeIO);
      return makeError(std::move(suffixChecksum.error()));
    }
    // prefix + write + suffix; each combine step is checked like in the append branch above.
    auto combineWrite = prefixChecksum->combine(writeIO.checksum, writeIO.length);
    if (UNLIKELY(!combineWrite)) {
      XLOGF(ERR,
            "Failed to combine prefix and write checksums: error {}, chunkId {}, meta {}, write io: {}",
            combineWrite.error(),
            chunkId,
            meta,
            writeIO);
      return makeError(combineWrite.error());
    }
    auto combineSuffix = prefixChecksum->combine(*suffixChecksum, suffixLength);
    if (UNLIKELY(!combineSuffix)) {
      XLOGF(ERR,
            "Failed to combine suffix checksum: error {}, chunkId {}, meta {}, write io: {}",
            combineSuffix.error(),
            chunkId,
            meta,
            writeIO);
      return makeError(combineSuffix.error());
    }
    meta.checksumValue = prefixChecksum->value;
    storageUpdateChecksumReadChunk.addSample(1);
  }
  meta.checksumType = writeIO.checksum.type;
  return Void{};
}
// commit the version of this chunk.
// Commit a version of a chunk.
// Committing the removal of a chunk that does not exist succeeds immediately. Otherwise the
// chain version and commit version are validated, the commit version is advanced (forced
// commits also reset the chunk to CLEAN), and the chunk becomes COMMIT once commit and update
// versions are equal. The metadata is then stored, or the chunk is removed if it is ready to
// be removed. Always returns 0 on success; job.result() carries the resulting versions.
Result<uint32_t> ChunkReplica::commit(ChunkStore &store, UpdateJob &job) {
  auto recordGuard = storageCommitRecorder.record();
  auto &commitIO = job.commitIO();
  auto &chunkId = commitIO.key.chunkId;
  auto &result = job.result();

  // Load the chunk; a missing chunk is fine when the commit belongs to a removal.
  auto lookup = store.get(chunkId);
  if (commitIO.isRemove && !lookup && lookup.error().code() == StorageCode::kChunkMetadataNotFound) {
    result.commitVer = result.updateVer = commitIO.commitVer;
    result.commitChainVer = commitIO.commitChainVer;
    return 0;
  }
  RETURN_AND_LOG_ON_ERROR(lookup);

  // Work on a copy of the cached entry; it is written back at the end.
  auto chunkInfo = (*lookup)->second;
  ChunkMetadata &meta = chunkInfo.meta;
  result.commitVer = meta.commitVer;
  result.updateVer = meta.updateVer;
  result.commitChainVer = meta.chainVer;

  // Reject commits from an older chain version.
  if (job.commitChainVer() < meta.chainVer) {
    auto msg = fmt::format("chunk {} {} chain version mismatch {}", chunkId, meta, commitIO);
    reportFatalEvent();
    XLOG(DFATAL, msg);
    return makeError(StorageCode::kChainVersionMismatch, std::move(msg));
  }
  // A version that was never written cannot be committed.
  if (commitIO.commitVer > meta.updateVer) {
    auto msg = fmt::format("chunk {} meta {} commit version mismatch", chunkId, meta);
    reportFatalEvent();
    XLOG(DFATAL, msg);
    return makeError(StorageCode::kChunkVersionMismatch, std::move(msg));
  }

  if (commitIO.isForce) {
    // Forced commit: accept the version unconditionally and clear the dirty state.
    meta.chunkState = ChunkState::CLEAN;
    meta.commitVer = commitIO.commitVer;
  } else {
    if (meta.chunkState == ChunkState::DIRTY) {
      auto msg = fmt::format("chunk {} is dirty {}", chunkId, meta);
      XLOG(ERR, msg);
      storageCommitDirty.addSample(1);
      return makeError(StorageCode::kChunkNotClean, std::move(msg));
    }
    if (!(meta.commitVer < commitIO.commitVer)) {
      // The requested version is not newer than what is already committed.
      auto msg = fmt::format("chunk {} stale commit {} > {}", chunkId, meta.commitVer, commitIO.commitVer);
      XLOG(ERR, msg);
      storageCommitStale.addSample(1);
      result.commitVer = meta.commitVer;
      result.commitChainVer = meta.chainVer;
      return makeError(StorageCode::kChunkStaleCommit, std::move(msg));
    }
    meta.commitVer = commitIO.commitVer;
  }

  // Fully committed: nothing is pending any more.
  if (meta.commitVer == meta.updateVer) {
    meta.chunkState = ChunkState::COMMIT;
    meta.chainVer = job.commitChainVer();
  }
  meta.lastRequestId = job.requestCtx().tag.requestId;
  meta.lastClientUuid = job.requestCtx().tag.clientId.uuid;
  meta.timestamp = UtcClock::now();

  auto stored = meta.readyToRemove() ? store.remove(chunkId, chunkInfo) : store.set(chunkId, chunkInfo);
  if (UNLIKELY(!stored)) {
    return makeError(std::move(stored.error()));
  }

  result.commitVer = meta.commitVer;
  result.commitChainVer = meta.chainVer;
  recordGuard.succ();
  return 0;
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,33 @@
#pragma once
#include <folly/Range.h>
#include "common/utils/Result.h"
#include "storage/aio/BatchReadJob.h"
#include "storage/store/ChunkMetadata.h"
#include "storage/store/ChunkStore.h"
#include "storage/update/UpdateJob.h"
namespace hf3fs::storage {
// Stateless collection of the chunk-level read, update and commit operations.
// Every function is static and works on the ChunkStore and job passed in.
class ChunkReplica {
public:
// prepare aio read: validate the chunk and fill the job with fd, offset and length.
static Result<Void> aioPrepareRead(ChunkStore &store, AioReadJob &job);
// finish aio read: verify the chunk was not updated while the read was in flight.
static Result<Void> aioFinishRead(ChunkStore &store, AioReadJob &job);
// do write (also handles truncate, extend and remove updates).
static Result<uint32_t> update(ChunkStore &store, UpdateJob &job, folly::CPUThreadPoolExecutor &executor);
// recompute the checksum stored in chunkInfo.meta after an update has been applied.
static Result<Void> updateChecksum(ChunkInfo &chunkInfo,
UpdateIO writeIO,
uint32_t chunkSizeBeforeWrite,
bool isAppendWrite);
// commit the version of this chunk.
static Result<uint32_t> commit(ChunkStore &store, UpdateJob &job);
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,216 @@
#include "storage/store/ChunkStore.h"
#include <chrono>
#include <fcntl.h>
#include <folly/Hash.h>
#include <folly/ScopeGuard.h>
#include <folly/logging/xlog.h>
#include "common/monitor/Recorder.h"
#include "common/utils/Result.h"
#include "storage/store/ChunkMetadata.h"
#include "storage/store/ChunkReplica.h"
#include "storage/update/UpdateJob.h"
namespace hf3fs::storage {
// Monitoring recorders for ChunkStore operations.
// Chunk creation.
monitor::OperationRecorder chunkStoreCreateRecorder{"storage.chunk_store_create"};
// Metadata updates, split into those written to the meta store and those that skipped it.
monitor::OperationRecorder chunkStoreSetRecorder{"storage.chunk_store_set"};
monitor::CountRecorder chunkStoreSetWriteDownRecorder{"storage.chunk_store_set.write_down"};
monitor::CountRecorder chunkStoreSetWriteSkipRecorder{"storage.chunk_store_set.write_skip"};
// Listing of all chunks, with distributions of total and uncommitted counts.
monitor::OperationRecorder listAllChunkIdsRecorder{"storage.list_all_chunks"};
monitor::DistributionRecorder chunkCountRecorder{"storage.list_all_chunks.chunk_count"};
monitor::DistributionRecorder uncommittedRecorder{"storage.list_all_chunks.uncommitted_count"};
// Chunk queries and resetting of uncommitted chunks.
monitor::OperationRecorder queryChunksRecorder{"storage.query_chunks"};
monitor::OperationRecorder resetUncommittedRecorder{"storage.reset_uncommitted"};
// initialize chunk store.
// Create a new chunk store: first the file store, then the meta store.
// On success the target id and the monitoring tag ("instance" = target id) are remembered.
Result<Void> ChunkStore::create(const PhysicalConfig &config) {
  auto fileStoreResult = fileStore_.create(config);
  RETURN_AND_LOG_ON_ERROR(fileStoreResult);

  auto metaStoreResult = metaStore_.create(config_.kv_store(), config);
  RETURN_AND_LOG_ON_ERROR(metaStoreResult);

  targetId_ = TargetId{config.target_id};
  tag_ = {{"instance", std::to_string(targetId_)}};
  return Void{};
}
// load chunk store.
// Load an existing chunk store: first the file store, then the meta store.
// On success the target id and the monitoring tag ("instance" = target id) are remembered.
Result<Void> ChunkStore::load(const PhysicalConfig &config) {
  auto fileStoreResult = fileStore_.load(config);
  RETURN_AND_LOG_ON_ERROR(fileStoreResult);

  auto metaStoreResult = metaStore_.load(config_.kv_store(), config);
  RETURN_AND_LOG_ON_ERROR(metaStoreResult);

  targetId_ = TargetId{config.target_id};
  tag_ = {{"instance", std::to_string(targetId_)}};
  return Void{};
}
// Register additional chunk sizes with both underlying stores.
// The file store is updated first; if it fails the meta store is not touched.
// NOTE(review): if the meta store step fails, the file store has already been
// updated and nothing here rolls that back - confirm the callee is idempotent.
Result<Void> ChunkStore::addChunkSize(const std::vector<Size> &sizeList) {
RETURN_AND_LOG_ON_ERROR(fileStore_.addChunkSize(sizeList));
RETURN_AND_LOG_ON_ERROR(metaStore_.addChunkSize(sizeList));
return Void{};
}
// Look up a chunk and return an iterator to its cached ChunkInfo.
// The in-memory shard is consulted first; on a miss the metadata is read from
// the meta store, the backing file is opened, and the entry is inserted into
// the shard. A meta store failure is returned to the caller unchanged.
Result<ChunkStore::Map::ConstIterator> ChunkStore::get(const ChunkId &chunkId) {
  auto &shard = maps_[std::hash<ChunkId>{}(chunkId) % kShardsNum];

  // Fast path: already cached.
  if (auto cached = shard.find(chunkId); cached != shard.end()) {
    return Result<Map::ConstIterator>(std::move(cached));
  }

  // Slow path: read the metadata from the meta store.
  ChunkInfo chunkInfo;
  auto metaResult = metaStore_.get(chunkId, chunkInfo.meta);
  if (!metaResult) {
    return makeError(std::move(metaResult.error()));
  }

  // Open the backing file and publish the entry in the cache.
  auto openResult = fileStore_.open(chunkInfo.meta.innerFileId);
  RETURN_AND_LOG_ON_ERROR(openResult);
  chunkInfo.view = *openResult;

  auto inserted = shard.emplace(chunkId, chunkInfo);
  return Result<Map::ConstIterator>(std::move(inserted.first));
}
// Create a new chunk of `chunkSize` bytes and cache it.
// The meta store performs the creation (it receives chunkInfo.meta, whose
// innerFileId is then used to open the backing file). On success the resulting
// ChunkInfo is stored in the in-memory shard, replacing any previous entry.
Result<Void> ChunkStore::createChunk(const ChunkId &chunkId,
                                     uint32_t chunkSize,
                                     ChunkInfo &chunkInfo,
                                     folly::CPUThreadPoolExecutor &executor,
                                     bool allowToAllocate) {
  auto recordGuard = chunkStoreCreateRecorder.record();

  auto metaResult = metaStore_.createChunk(chunkId, chunkInfo.meta, chunkSize, executor, allowToAllocate);
  RETURN_AND_LOG_ON_ERROR(metaResult);

  auto openResult = fileStore_.open(chunkInfo.meta.innerFileId);
  RETURN_AND_LOG_ON_ERROR(openResult);
  chunkInfo.view = *openResult;

  const auto shardIndex = std::hash<ChunkId>{}(chunkId) % kShardsNum;
  maps_[shardIndex].insert_or_assign(chunkId, chunkInfo);

  recordGuard.succ();
  return Void{};
}
// Update the cached ChunkInfo of a chunk and, optionally, its persisted metadata.
// The metadata is written to the meta store when `persist` is true or when the
// `force_persist` config item is enabled; otherwise only the in-memory shard is
// updated. A failed meta store write returns before the cache is touched.
Result<Void> ChunkStore::set(const ChunkId &chunkId, const ChunkInfo &chunkInfo, bool persist /* = true */) {
  auto recordGuard = chunkStoreSetRecorder.record();

  const bool writeDown = persist || config_.force_persist();
  if (!writeDown) {
    chunkStoreSetWriteSkipRecorder.addSample(1);
  } else {
    chunkStoreSetWriteDownRecorder.addSample(1);
    auto metaResult = metaStore_.set(chunkId, chunkInfo.meta);
    RETURN_AND_LOG_ON_ERROR(metaResult);
  }

  const auto shardIndex = std::hash<ChunkId>{}(chunkId) % kShardsNum;
  maps_[shardIndex].insert_or_assign(chunkId, chunkInfo);

  recordGuard.succ();
  return Void{};
}
// Remove a chunk from the meta store and from the in-memory cache.
// Fails with kChunkNotReadyToRemove unless chunkInfo.meta reports that it is
// ready to be removed. The cache entry is erased only after the meta store
// removal succeeded.
Result<Void> ChunkStore::remove(ChunkId chunkId, ChunkInfo &chunkInfo) {
  const bool removable = chunkInfo.meta.readyToRemove();
  if (UNLIKELY(!removable)) {
    return makeError(StorageCode::kChunkNotReadyToRemove);
  }
  XLOGF(DBG, "ready to remove: {}", chunkInfo.meta);

  // Resolve the cache entry first (this loads it from the meta store if needed).
  auto getResult = get(chunkId);
  RETURN_AND_LOG_ON_ERROR(getResult);

  RETURN_AND_LOG_ON_ERROR(metaStore_.remove(chunkId, chunkInfo.meta));

  const auto shardIndex = std::hash<ChunkId>{}(chunkId) % kShardsNum;
  maps_[shardIndex].erase(*getResult);
  return Void{};
}
// Collect the (chunk id, metadata) pairs whose id lies in [begin, end) of
// `chunkIdRange`, visiting at most `maxNumChunkIdsToProcess` matches.
// The meta store iterator is positioned at `end`; an entry equal to `end` is
// skipped (the range is half-open) and the scan stops at the first id smaller
// than `begin`. Returns an error if an entry's metadata cannot be decoded or if
// the iterator reports a failure status after the scan.
Result<std::vector<std::pair<ChunkId, ChunkMetadata>>> ChunkStore::queryChunks(const ChunkIdRange &chunkIdRange) {
  auto recordGuard = queryChunksRecorder.record(tag_);

  std::vector<std::pair<ChunkId, ChunkMetadata>> chunks;
  auto it = metaStore_.iterator(chunkIdRange.end.data());
  RETURN_AND_LOG_ON_ERROR(it);

  for (; it->valid() && chunks.size() < chunkIdRange.maxNumChunkIdsToProcess; it->next()) {
    auto chunkId = it->chunkId();
    if (chunkId == chunkIdRange.end) {  // [begin, end)
      continue;
    }
    if (chunkId < chunkIdRange.begin) {
      break;
    }
    // Decode the metadata only for entries that are actually returned; the
    // skip/stop decisions above depend on the id alone.
    auto metadata = it->meta();
    RETURN_AND_LOG_ON_ERROR(metadata);
    chunks.emplace_back(std::move(chunkId), std::move(*metadata));
  }
  RETURN_AND_LOG_ON_ERROR(it->status());

  recordGuard.succ();
  return chunks;
}
// Append one summary record per chunk in the meta store to `metas`.
// Each record carries the chunk id, the update/commit/chain versions, the chunk
// state, the checksum and the length. Stops with an error at the first entry
// whose metadata cannot be parsed (entries appended before that remain in
// `metas`) or when the iterator reports a failure status.
Result<Void> ChunkStore::getAllMetadata(ChunkMetaVector &metas) {
  auto recordGuard = listAllChunkIdsRecorder.record(tag_);

  auto it = metaStore_.iterator();
  RETURN_AND_LOG_ON_ERROR(it);

  while (it->valid()) {
    auto chunkId = it->chunkId();
    auto metaResult = it->meta();
    if (UNLIKELY(!metaResult)) {
      XLOGF(ERR, "chunk {} parse meta failed {}", chunkId, metaResult.error());
      return makeError(std::move(metaResult.error()));
    }

    auto &meta = *metaResult;
    metas.emplace_back();
    auto &entry = metas.back();
    entry.chunkId = std::move(chunkId);
    entry.updateVer = meta.updateVer;
    entry.commitVer = meta.commitVer;
    entry.chainVer = meta.chainVer;
    entry.chunkState = meta.chunkState;
    entry.checksum = meta.checksum();
    entry.length = meta.size;

    it->next();
  }
  RETURN_AND_LOG_ON_ERROR(it->status());

  recordGuard.succ();
  chunkCountRecorder.addSample(metas.size(), tag_);
  return Void{};
}
// Force-commit every chunk the meta store lists as uncommitted.
// For each such chunk a forced commit at its current update version is issued
// through ChunkReplica::commit, stamped with `chainVer`. Chunks that cannot be
// found or whose commit fails are logged and skipped. The uncommitted list is
// cleared at the end regardless of individual failures, and the function
// always returns success.
Result<Void> ChunkStore::resetUncommitted(ChainVer chainVer) {
  auto &uncommitted = metaStore_.uncommitted();
  if (uncommitted.empty()) {
    return Void{};
  }
  XLOGF(CRITICAL, "reset uncommitted chunks, size: {}", uncommitted.size());

  for (auto &chunkId : uncommitted) {
    auto recordGuard = resetUncommittedRecorder.record();

    auto found = get(chunkId);
    if (!found) {
      XLOGF(ERR, "reset uncommitted chunk {} not found", chunkId);
      continue;
    }
    auto info = (*found)->second;
    XLOGF(CRITICAL, "reset uncommitted chunk {} meta {}", chunkId, info.meta);

    // Build a forced commit that promotes the current update version.
    CommitIO io;
    io.key.chunkId = chunkId;
    io.commitVer = info.meta.updateVer;
    io.isForce = true;
    io.commitChainVer = chainVer;

    ServiceRequestContext ctx{"commit", MessageTag(ClientId{Uuid::max()}, {})};
    ChunkEngineUpdateJob engineJob{};
    UpdateJob job(ctx, io, {}, engineJob, nullptr);

    auto committed = ChunkReplica::commit(*this, job);
    if (!committed) {
      XLOGF(ERR, "reset uncommitted chunk {} set failed: {}", chunkId, committed.error());
      continue;
    }
    recordGuard.succ();
  }

  uncommitted.clear();
  return Void{};
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,117 @@
#pragma once
#include <folly/concurrency/ConcurrentHashMap.h>
#include <optional>
#include <vector>
#include "common/monitor/Recorder.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/FdWrapper.h"
#include "common/utils/Path.h"
#include "common/utils/Result.h"
#include "common/utils/RobinHood.h"
#include "storage/store/ChunkFileStore.h"
#include "storage/store/ChunkMetaStore.h"
#include "storage/store/ChunkMetadata.h"
namespace hf3fs::storage {
// Selects how StorageTarget::queryChunks recognizes a range that addresses a
// single chunk and can therefore be served by a point lookup.
enum class PointQueryStrategy {
// Default config value; never take the point-lookup path.
NONE,
// Point lookup when `begin.nextChunkId() == end`.
CLASSIC,
// Point lookup when `begin.rangeEndForCurrentChunk() == end`.
MODERN,
};
// Chunk store of one storage target: combines a ChunkFileStore (chunk data
// files), a ChunkMetaStore (chunk metadata) and a sharded in-memory map from
// ChunkId to ChunkInfo that caches opened chunks.
class ChunkStore {
public:
class Config : public ConfigBase<Config> {
// Settings passed to the meta store on create/load/migrate.
CONFIG_OBJ(kv_store, kv::KVStore::Config);
CONFIG_OBJ(file_store, ChunkFileStore::Config);
CONFIG_OBJ(meta_store, ChunkMetaStore::Config);
// NOTE(review): not referenced by the ChunkStore code visible here - confirm usage.
CONFIG_ITEM(mutex_num, 257u, ConfigCheckers::isPositivePrime<uint32_t>);
// Optional separate location for KV data; read by StorageTarget::create.
CONFIG_ITEM(kv_path, Path{});
// When true, StorageTarget::load migrates the KV store if its type differs from kv_store.type().
CONFIG_HOT_UPDATED_ITEM(migrate_kv_store, false);
// When true, set() always writes metadata to the meta store, ignoring its `persist` argument.
CONFIG_HOT_UPDATED_ITEM(force_persist, true);
// See PointQueryStrategy; read by StorageTarget::queryChunks.
CONFIG_HOT_UPDATED_ITEM(point_query_strategy, PointQueryStrategy::NONE);
};
// One shard of the in-memory chunk cache.
using Map = folly::ConcurrentHashMap<ChunkId, ChunkInfo>;
// Keeps a reference to `config`; the caller must keep it alive for the lifetime of this object.
ChunkStore(const Config &config, GlobalFileStore &globalFileStore)
: config_(config),
fileStore_(config_.file_store(), globalFileStore),
metaStore_(config_.meta_store(), fileStore_) {}
// direct access to the underlying meta store.
ChunkMetaStore &chunkMetaStore() { return metaStore_; }
// create chunk store.
Result<Void> create(const PhysicalConfig &config);
// load chunk store.
Result<Void> load(const PhysicalConfig &config);
// add new chunk size.
Result<Void> addChunkSize(const std::vector<Size> &sizeList);
// migrate meta store.
Result<Void> migrate(const PhysicalConfig &config) { return metaStore_.migrate(config_.kv_store(), config); }
// get the cached info of a chunk, loading it from the meta store on a cache miss.
Result<Map::ConstIterator> get(const ChunkId &chunkId);
// create a new chunk file and cache its info.
Result<Void> createChunk(const ChunkId &chunkId,
uint32_t chunkSize,
ChunkInfo &chunkInfo,
folly::CPUThreadPoolExecutor &executor,
bool allowToAllocate);
// set info of a chunk; metadata is persisted when `persist` or the force_persist config is true.
Result<Void> set(const ChunkId &chunkId, const ChunkInfo &chunkInfo, bool persist = true);
// remove a chunk from the meta store and the cache.
Result<Void> remove(ChunkId chunkId, ChunkInfo &chunkInfo);
// recycle a batch of chunks (forwarded to the meta store).
Result<bool> punchHole() { return metaStore_.punchHole(); }
// sync meta kv.
Result<Void> sync() { return metaStore_.sync(); }
// query chunks: the chunk ids in result are in reverse lexicographical order
Result<std::vector<std::pair<ChunkId, ChunkMetadata>>> queryChunks(const ChunkIdRange &chunkIdRange);
// append a summary record of every chunk to `metas`.
Result<Void> getAllMetadata(ChunkMetaVector &metas);
// get meta iterator.
auto metaIterator() { return metaStore_.iterator(); }
// get used size.
uint64_t usedSize() const { return metaStore_.usedSize(); }
// get reserved and unrecycled size (both are output parameters).
Result<Void> unusedSize(int64_t &reservedSize, int64_t &unrecycledSize) {
return metaStore_.unusedSize(reservedSize, unrecycledSize);
}
// get all uncommitted chunk ids.
const auto &uncommitted() { return metaStore_.uncommitted(); }
// reset uncommitted chunk to committed state.
Result<Void> resetUncommitted(ChainVer chainVer);
// enable or disable emergency recycling.
void setEmergencyRecycling(bool enable) { return metaStore_.setEmergencyRecycling(enable); }
private:
// Runtime configuration, held by reference (not owned).
const Config &config_;
ChunkFileStore fileStore_;
// Declared after fileStore_ because it is constructed with a reference to it.
ChunkMetaStore metaStore_;
// Set by create()/load() from the physical config.
TargetId targetId_;
// Monitoring tag {"instance": <target id>}, set by create()/load().
monitor::TagSet tag_;
// The cache is split into kShardsNum maps, selected by std::hash<ChunkId> % kShardsNum.
static constexpr auto kShardsNum = 32u;
std::array<Map, kShardsNum> maps_;
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,77 @@
#include "storage/store/GlobalFileStore.h"
#include <fcntl.h>
#include <folly/logging/xlog.h>
#include "common/utils/Duration.h"
#include "common/utils/Size.h"
namespace hf3fs::storage {
// Return the cached descriptor pair for `filePath`, opening the file if needed.
// Runs under the lock of the shard selected by `filePath`. The file is opened
// twice: once with O_RDWR | O_SYNC (created with mode 0644 when `createFile` is
// set) and once with O_RDWR | O_DIRECT. The cache entry is only filled in when
// both opens succeed; either failure yields kChunkOpenFailed.
Result<FileDescriptor *> GlobalFileStore::open(const Path &filePath, bool createFile /* = false */) {
  return shards_.withLock(
      [&](FdMap &map) -> Result<FileDescriptor *> {
        auto &slot = map[filePath];
        if (slot.normal_.valid()) {
          // Already opened earlier; reuse it.
          return &slot;
        }

        FileDescriptor opened;

        // First descriptor: synchronous mode, optionally creating the file.
        const auto syncFlags = O_RDWR | O_SYNC;
        int normalFd;
        if (createFile) {
          normalFd = ::open(filePath.c_str(), O_CREAT | syncFlags, 0644);
        } else {
          normalFd = ::open(filePath.c_str(), syncFlags);
        }
        if (UNLIKELY(normalFd == -1)) {
          auto msg = fmt::format("chunk store open file {} failed: errno {}", filePath, errno);
          XLOG(ERR, msg);
          return makeError(StorageCode::kChunkOpenFailed, std::move(msg));
        }
        opened.normal_ = normalFd;

        // Second descriptor: direct I/O mode on the same path.
        const auto directFlags = O_RDWR | O_DIRECT;
        const int directFd = ::open(filePath.c_str(), directFlags);
        if (UNLIKELY(directFd == -1)) {
          auto msg = fmt::format("chunk store open file {} failed: errno {}", filePath, errno);
          XLOG(ERR, msg);
          return makeError(StorageCode::kChunkOpenFailed, std::move(msg));
        }
        opened.direct_ = directFd;

        slot = std::move(opened);
        return &slot;
      },
      filePath);
}
// Rebuild `fds` with the direct-mode descriptor of every cached file.
// Each FileDescriptor also records its position in `fds` in `index_`.
void GlobalFileStore::collect(std::vector<int> &fds) {
  fds.clear();
  fds.reserve(128_KB);
  shards_.iterate([&](FdMap &map) {
    for (auto &entry : map) {
      auto &descriptor = entry.second;
      descriptor.index_ = fds.size();
      fds.push_back(descriptor.direct_);
    }
  });
}
// Drop every cached descriptor, spreading the work over `executor`.
// The contents of each shard are moved into a task that clears them on an
// executor thread; this call then polls every 50 ms until all kShardsNum tasks
// have reported completion. Always returns success.
Result<Void> GlobalFileStore::clear(CPUExecutorGroup &executor) {
  std::atomic<uint32_t> finished{0};
  shards_.iterate([&](FdMap &map) {
    executor.pickNext().add([&finished, m = std::move(map)]() mutable {
      m.clear();
      ++finished;
    });
  });

  // `finished` lives on this stack frame, so wait here until every task is done.
  int round = 0;
  while (finished != kShardsNum) {
    XLOGF_IF(INFO, round % 5 == 0, "Waiting for clear fd finished...");
    std::this_thread::sleep_for(50_ms);
    ++round;
  }
  return Void{};
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,36 @@
#pragma once
#include <folly/ThreadLocal.h>
#include <folly/executors/CPUThreadPoolExecutor.h>
#include <mutex>
#include "common/utils/CPUExecutorGroup.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/FdWrapper.h"
#include "common/utils/Path.h"
#include "common/utils/Result.h"
#include "common/utils/Shards.h"
namespace hf3fs::storage {
// Pair of descriptors that GlobalFileStore keeps for one file path.
struct FileDescriptor {
// Opened by GlobalFileStore::open with O_RDWR | O_SYNC.
FdWrapper normal_;
// Opened by GlobalFileStore::open with O_RDWR | O_DIRECT.
FdWrapper direct_;
// Position of `direct_` in the vector filled by GlobalFileStore::collect; empty until collect() has run.
std::optional<uint32_t> index_{};
};
// Cache of opened file descriptors keyed by path, split into kShardsNum shards.
class GlobalFileStore {
public:
// Return the cached descriptors for `filePath`, opening (and optionally creating) the file on first use.
Result<FileDescriptor *> open(const Path &filePath, bool createFile = false);
// Fill `fds` with the direct-mode descriptor of every cached file and record each file's index.
void collect(std::vector<int> &fds);
// Clear all shards using tasks scheduled on `executor`; blocks until they have finished.
Result<Void> clear(CPUExecutorGroup &executor);
private:
constexpr static auto kShardsNum = 256u;
using FdMap = std::unordered_map<Path, FileDescriptor>;
Shards<FdMap, kShardsNum> shards_;
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,38 @@
#pragma once
#include "common/serde/Serde.h"
#include "common/utils/Path.h"
#include "common/utils/Size.h"
#include "kv/KVStore.h"
namespace hf3fs::storage {
// Physical configuration of the storage target. Store in `target.toml`.
// Name of the config file inside the target directory.
static inline constexpr auto kPhysicalConfigFileName = "target.toml";
// Per-target settings serialized to / parsed from the target's TOML file.
class PhysicalConfig {
// Target directory; StorageTarget::load rejects a file whose `path` differs from where it was found.
SERDE_STRUCT_FIELD(path, Path{});
SERDE_STRUCT_FIELD(target_id, uint64_t{});
// Filesystem UUID of the device holding `path`; checked by StorageTarget::load.
SERDE_STRUCT_FIELD(block_device_uuid, std::string{});
// When true, a failed UUID lookup is tolerated and an empty UUID is used instead.
SERDE_STRUCT_FIELD(allow_disk_without_uuid, false);
// When true, StorageTarget::create accepts (and validates) a target that already exists.
SERDE_STRUCT_FIELD(allow_existing_targets, false);
SERDE_STRUCT_FIELD(physical_file_count, 256u);
SERDE_STRUCT_FIELD(chunk_size_list, (std::vector<Size>{512_KB, 1_MB, 2_MB, 4_MB, 16_MB, 64_MB}));
// 0 is treated as "not assigned yet" by StorageTarget::setChainId.
SERDE_STRUCT_FIELD(chain_id, uint32_t{});
SERDE_STRUCT_FIELD(kv_store_type, kv::KVStore::Type::LevelDB);
// Set to true by StorageTarget::create and after a KV migration.
// NOTE(review): the meaning of the sentinel is defined by the meta store - not visible here.
SERDE_STRUCT_FIELD(has_sentinel, false);
SERDE_STRUCT_FIELD(kv_store_name, std::string{"meta"});
// Optional base directory for KV data outside the target directory (see kvPath()).
SERDE_STRUCT_FIELD(kv_path, std::optional<Path>{});
// When true, the target is served by the chunk engine instead of ChunkStore (StorageTarget::useChunkEngine).
SERDE_STRUCT_FIELD(only_chunk_engine, false);
public:
// Directory of the KV store: `<kv_path>/<kv_store_name>_<target_id>` when kv_path
// is set, otherwise `<path>/<kv_store_name>`.
Path kvPath() const {
if (kv_path.has_value()) {
return *kv_path / fmt::format("{}_{}", kv_store_name, target_id);
}
return path / kv_store_name;
}
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,445 @@
#include "storage/store/StorageTarget.h"
#include <boost/filesystem/operations.hpp>
#include <folly/experimental/symbolizer/Symbolizer.h>
#include <sys/stat.h>
#include "common/monitor/Recorder.h"
#include "common/serde/Serde.h"
#include "common/utils/MagicEnum.hpp"
#include "common/utils/Result.h"
#include "common/utils/SysResource.h"
#include "storage/store/ChunkEngine.h"
#include "storage/store/ChunkReplica.h"
namespace hf3fs::storage {
namespace {
// Counters bumped by StorageTarget::updateChunk, one per failed update with the matching error code.
monitor::CountRecorder storageUpdateChecksumMismatch{"storage.chunk_update.checksum_mismatch"};
// Successful updates whose job options have isSyncing set.
monitor::CountRecorder storageUpdateReplace{"storage.chunk_update.replace"};
monitor::CountRecorder storageUpdateCommitted{"storage.chunk_update.committed"};
monitor::CountRecorder storageUpdateStale{"storage.chunk_update.stale"};
monitor::CountRecorder storageUpdateMissing{"storage.chunk_update.missing"};
monitor::CountRecorder storageUpdateAdvance{"storage.chunk_update.advance"};
// Successful updates, split by operation (write / remove / extend).
monitor::CountRecorder storageWriteTimes{"storage.chunk_write.times"};
monitor::CountRecorder storageRemoveTimes{"storage.chunk_remove.times"};
monitor::CountRecorder storageTruncateTimes{"storage.chunk_truncate.times"};
// Read statistics; StorageTarget binds these to its disk tag in the constructor.
monitor::CountRecorder aioReadCountPerDisk{"storage.aio_read.count_per_disk"};
monitor::CountRecorder aioReadBytesPerDisk{"storage.aio_read.bytes_per_disk"};
monitor::CountRecorder aioReadSuccBytesPerDisk{"storage.aio_read.succ_bytes_per_disk"};
monitor::LatencyRecorder aioReadSuccLatencyPerDisk{"storage.aio_read.succ_latency_per_disk"};
// Size gauges; StorageTarget binds these to its target tag.
monitor::ValueRecorder targetUsedSize{"storage.target.used_size", std::nullopt, false};
monitor::ValueRecorder targetReservedSize{"storage.target.reserved_size", std::nullopt, false};
monitor::ValueRecorder targetUnrecycledSize{"storage.target.unrecycled_size", std::nullopt, false};
// Recorded around the point-lookup path of StorageTarget::queryChunks.
monitor::OperationRecorder pointQueryRecorder{"storage.point_query"};
// Incremented for every StorageTarget constructed; the new value becomes that target's generation id.
std::atomic<uint32_t> gGenerationId{};
// Resolve the filesystem UUID of the device that holds `path`.
// Returns kStorageStatFailed when stat() fails, propagates a failure of
// SysResource::fileSystemUUID(), and returns kStorageUUIDMismatch when the
// device of `path` has no entry in the UUID table.
Result<std::string> getDeviceUUID(const Path &path) {
  struct stat pathStat;
  if (::stat(path.c_str(), &pathStat) != 0) {
    auto msg = fmt::format("stat {} failed: {}", path.string(), errno);
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageStatFailed, std::move(msg));
  }

  auto getDeviceUUIDResult = SysResource::fileSystemUUID();
  RETURN_AND_LOG_ON_ERROR(getDeviceUUIDResult);

  const auto device = pathStat.st_dev;
  if (getDeviceUUIDResult->count(device) == 0) {
    auto msg = fmt::format("Not found UUID for path {} device {}", path.string(), device);
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageUUIDMismatch, std::move(msg));
  }
  return getDeviceUUIDResult->at(device);
}
} // namespace
// Construct an unloaded storage target; create() or load() must follow.
// - config: runtime configuration, stored by reference and also handed to the ChunkStore.
// - globalFileStore: descriptor cache handed to the ChunkStore.
// - diskIndex: index of the disk, used for the per-disk monitoring tag.
// - engine: chunk engine pointer, dereferenced only when the target config selects the chunk engine.
StorageTarget::StorageTarget(const Config &config,
GlobalFileStore &globalFileStore,
uint32_t diskIndex,
chunk_engine::Engine *engine)
: config_(config),
diskIndex_(diskIndex),
// Take the next process-wide generation number.
generationId_(++gGenerationId),
engine_(engine),
diskTag_(monitor::instanceTagSet(std::to_string(diskIndex))),
// Placeholder tag "0": the real target id is unknown until create()/load(), which rebind the target recorders.
targetTag_(monitor::instanceTagSet(std::to_string(0))),
readCountPerDisk_(aioReadCountPerDisk.getRecoderWithTag(diskTag_)),
readBytesPerDisk_(aioReadBytesPerDisk.getRecoderWithTag(diskTag_)),
readSuccBytesPerDisk_(aioReadSuccBytesPerDisk.getRecoderWithTag(diskTag_)),
readSuccLatencyPerDisk_(aioReadSuccLatencyPerDisk.getRecoderWithTag(diskTag_)),
targetUsedSize_(targetUsedSize.getRecoderWithTag(targetTag_)),
targetReservedSize_(targetReservedSize.getRecoderWithTag(targetTag_)),
targetUnrecycledSize_(targetUnrecycledSize.getRecoderWithTag(targetTag_)),
chunkStore_(config_, globalFileStore) {}
// Sync the metadata one last time unless release() has already been called.
// A sync failure can only be logged from a destructor.
StorageTarget::~StorageTarget() {
  if (!released_) {
    auto result = sync();
    if (UNLIKELY(!result)) {
      XLOGF(CRITICAL, "storage target sync meta failed {}, error: {}", targetConfig_.path, result.error());
    }
  }
}
// Create the storage target described by `config`, or validate an existing one.
//
// If the target config file already exists under `config.path`:
//   - fail with kStorageInitFailed unless `config.allow_existing_targets` is set;
//   - otherwise load the target, require that its target id and physical file
//     count match `config`, and add any chunk sizes from `config` that are missing.
// Otherwise a new target is created: the chunk store (or, for chunk-engine
// targets, just the directory) is set up, the device UUID is recorded, and the
// resulting config is written to the target config file.
Result<Void> StorageTarget::create(const PhysicalConfig &config) {
Path targetConfigFilePath = config.path / kPhysicalConfigFileName;
if (boost::filesystem::exists(targetConfigFilePath)) {
auto msg = fmt::format("Target config file {} already exists", targetConfigFilePath.string());
XLOG(INFO, msg);
if (!config.allow_existing_targets) {
return makeError(StorageCode::kStorageInitFailed, std::move(msg));
}
// Existing target: load it and verify that it matches the requested config.
RETURN_AND_LOG_ON_ERROR(load(config.path));
if (targetConfig_.target_id != config.target_id) {
auto msg = fmt::format("target id is different: {} != {}", targetConfig_.target_id, config.target_id);
XLOG(ERR, msg);
return makeError(StorageCode::kStorageInitFailed, std::move(msg));
}
if (targetConfig_.physical_file_count != config.physical_file_count) {
auto msg = fmt::format("Physical file count is different: {} != {}",
targetConfig_.physical_file_count,
config.physical_file_count);
XLOG(ERR, msg);
return makeError(StorageCode::kStorageInitFailed, std::move(msg));
}
RETURN_AND_LOG_ON_ERROR(addChunkSize(config.chunk_size_list));
XLOGF(INFO, "Target config file {} check passed", targetConfigFilePath.string());
return Void{};
}
// New target: start from the requested config.
targetConfig_ = config;
targetConfig_.has_sentinel = true;
// The KV location comes from the runtime config; an empty path means "inside the target directory".
auto kvPath = config_.kv_path();
if (kvPath.empty()) {
targetConfig_.kv_path = std::nullopt;
} else {
targetConfig_.kv_path = kvPath;
}
if (useChunkEngine()) {
// Chunk-engine targets only need the directory; no ChunkStore is created.
boost::system::error_code ec{};
boost::filesystem::create_directories(targetConfig_.path, ec);
if (UNLIKELY(ec.failed())) {
auto msg = fmt::format("target create directory {} failed: {}", targetConfig_.path.string(), ec.message());
XLOG(ERR, msg);
return makeError(StorageCode::kChunkOpenFailed, std::move(msg));
}
} else {
RETURN_AND_LOG_ON_ERROR(chunkStore_.create(targetConfig_));
}
// Record the device UUID; a failed lookup is tolerated only when explicitly allowed.
auto getDeviceUUIDResult = getDeviceUUID(targetConfig_.path);
if (getDeviceUUIDResult) {
targetConfig_.block_device_uuid = *getDeviceUUIDResult;
} else if (targetConfig_.allow_disk_without_uuid) {
targetConfig_.block_device_uuid = "";
} else {
// The result is known to hold an error here, so this always returns it.
RETURN_AND_LOG_ON_ERROR(getDeviceUUIDResult);
}
// Persist the final config.
// NOTE(review): the stream is buffered and never flushed or closed explicitly, so a
// write error that only surfaces when the stream is destroyed is not reported here.
std::ofstream targetConfigFile(targetConfigFilePath, std::ios::out);
if (!targetConfigFile) {
auto msg = fmt::format("Open target config file {} failed", targetConfigFilePath.string());
XLOG(ERR, msg);
return makeError(StorageCode::kStorageInitFailed, std::move(msg));
}
if (!(targetConfigFile << serde::toTomlString(targetConfig_))) {
auto msg = fmt::format("Write target config file {} failed", targetConfigFilePath.string());
XLOG(ERR, msg);
return makeError(StorageCode::kStorageInitFailed, std::move(msg));
}
// Publish the chunk size set and rebind the target-level recorders to the real target id.
*chunkSizeList_.lock() = {targetConfig_.chunk_size_list.begin(), targetConfig_.chunk_size_list.end()};
targetTag_ = monitor::instanceTagSet(std::to_string(targetConfig_.target_id));
targetUsedSize_ = targetUsedSize.getRecoderWithTag(targetTag_);
targetReservedSize_ = targetReservedSize.getRecoderWithTag(targetTag_);
targetUnrecycledSize_ = targetUnrecycledSize.getRecoderWithTag(targetTag_);
return Void{};
}
// Load an existing storage target from the directory `path`.
// Reads the target config file, checks that the stored path and device UUID
// match reality, loads the chunk store (unless the target uses only the chunk
// engine) and, when requested by the runtime config, migrates the KV store to
// the configured type and rewrites the config file.
Result<Void> StorageTarget::load(const Path &path) {
RETURN_AND_LOG_ON_ERROR(serde::fromTomlFile(targetConfig_, path / kPhysicalConfigFileName));
// The config must have been written for this very directory.
if (path != targetConfig_.path) {
auto msg = fmt::format("Path config mismatch {} != real {}", targetConfig_.path, path);
XLOG(ERR, msg);
return makeError(StorageCode::kStorageInitFailed, std::move(msg));
}
auto getDeviceUUIDResult = getDeviceUUID(path);
if (!getDeviceUUIDResult) {
if (targetConfig_.allow_disk_without_uuid) {
// Tolerated: compare against an empty UUID below.
getDeviceUUIDResult = "";
} else {
RETURN_AND_LOG_ON_ERROR(getDeviceUUIDResult);
}
}
// Refuse to load a target whose recorded device UUID differs from the current one.
if (targetConfig_.block_device_uuid != *getDeviceUUIDResult) {
auto msg = fmt::format("UUID mismatch config {} != real {}", targetConfig_.block_device_uuid, *getDeviceUUIDResult);
XLOG(ERR, msg);
return makeError(StorageCode::kStorageUUIDMismatch, std::move(msg));
}
if (!targetConfig_.only_chunk_engine) {
RETURN_AND_LOG_ON_ERROR(chunkStore_.load(targetConfig_));
}
// Optional KV migration: only when enabled and the on-disk KV type differs from the configured one.
if (!targetConfig_.only_chunk_engine && config_.migrate_kv_store() &&
config_.kv_store().type() != targetConfig_.kv_store_type) {
XLOGF(WARNING, "start migrate kv {} -> {}", targetConfig_, magic_enum::enum_name(config_.kv_store().type()));
targetConfig_.kv_store_name = "kv";
targetConfig_.kv_store_type = config_.kv_store().type();
targetConfig_.has_sentinel = true;
RETURN_AND_LOG_ON_ERROR(chunkStore_.migrate(targetConfig_));
// NOTE(review): the config file is overwritten in place (no temp file + rename as in
// addChunkSize/setChainId) and the stream is not flushed before success is reported.
Path targetConfigFilePath = path / kPhysicalConfigFileName;
std::ofstream targetConfigFile(targetConfigFilePath, std::ios::out);
if (!targetConfigFile || !(targetConfigFile << serde::toTomlString(targetConfig_))) {
auto msg = fmt::format("Write target config file {} failed", targetConfigFilePath.string());
XLOG(ERR, msg);
return makeError(StorageCode::kStorageInitFailed, std::move(msg));
}
XLOGF(WARNING, "finish migrate kv {} -> {}", targetConfig_, magic_enum::enum_name(config_.kv_store().type()));
}
// Publish the chunk size set and rebind the target-level recorders to the real target id.
*chunkSizeList_.lock() = {targetConfig_.chunk_size_list.begin(), targetConfig_.chunk_size_list.end()};
targetTag_ = monitor::instanceTagSet(std::to_string(targetConfig_.target_id));
targetUsedSize_ = targetUsedSize.getRecoderWithTag(targetTag_);
targetReservedSize_ = targetReservedSize.getRecoderWithTag(targetTag_);
targetUnrecycledSize_ = targetUnrecycledSize.getRecoderWithTag(targetTag_);
return Void{};
}
// Add the chunk sizes from `sizeList` that this target does not support yet.
// No-op for chunk-engine targets and when every size is already known.
// Otherwise the new sizes are registered with the chunk store, the enlarged
// (sorted) list is written to a temporary file which then replaces the target
// config file, and finally the in-memory state is updated.
// Returns kStorageInitFailed if the config file cannot be written or replaced.
Result<Void> StorageTarget::addChunkSize(const std::vector<Size> &sizeList) {
  if (useChunkEngine()) {
    return Void{};
  }

  auto chunkSizeListGuard = chunkSizeList_.lock();
  std::vector<Size> newSizeList;
  for (auto size : sizeList) {
    if (!chunkSizeListGuard->contains(size)) {
      newSizeList.push_back(size);
    }
  }
  if (newSizeList.empty()) {
    return Void{};
  }
  RETURN_AND_LOG_ON_ERROR(chunkStore_.addChunkSize(newSizeList));

  // The persisted list is the union of the new and the already known sizes.
  for (auto size : *chunkSizeListGuard) {
    newSizeList.push_back(size);
  }
  std::sort(newSizeList.begin(), newSizeList.end());

  auto newTargetConfig = targetConfig_;
  newTargetConfig.chunk_size_list = newSizeList;

  // Write the new config to a temporary file. The stream must be closed before
  // the rename: otherwise the buffered content reaches the file only when the
  // stream is destroyed, i.e. after the file has already been moved into place,
  // and a failing flush would go unnoticed.
  Path tempPath = newTargetConfig.path / fmt::format("{}.tmp", kPhysicalConfigFileName);
  std::ofstream targetConfigFile(tempPath, std::ios::out);
  if (targetConfigFile) {
    targetConfigFile << serde::toTomlString(newTargetConfig);
    targetConfigFile.close();  // flushes; sets failbit if the flush or close fails.
  }
  if (!targetConfigFile) {
    auto msg = fmt::format("Write target config file {} failed", tempPath.string());
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }

  Path targetPath = newTargetConfig.path / kPhysicalConfigFileName;
  boost::system::error_code ec;
  boost::filesystem::rename(tempPath, targetPath, ec);
  if (UNLIKELY(ec.failed())) {
    auto msg = fmt::format("Re-write target config file {} failed, error: {}", targetPath.string(), ec.message());
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }

  // Keep the in-memory config in step with the file, so that a later rewrite
  // of the config (e.g. setChainId) does not persist the old size list again.
  targetConfig_.chunk_size_list = std::move(newTargetConfig.chunk_size_list);
  for (auto size : sizeList) {
    chunkSizeListGuard->insert(size);
  }
  return Void{};
}
// Assign the chain id of this target and persist it in the target config file.
// Does nothing when a non-zero chain id is already set. The new config is
// written to a temporary file which then replaces the target config file; the
// in-memory chain id is updated only after that succeeded.
// Returns kStorageInitFailed if the config file cannot be written or replaced.
Result<Void> StorageTarget::setChainId(ChainId chainId) {
  // The chunk size list lock doubles as the lock for config file rewrites.
  auto chunkSizeListGuard = chunkSizeList_.lock();
  if (targetConfig_.chain_id != 0) {
    return Void{};
  }

  auto newTargetConfig = targetConfig_;
  newTargetConfig.chain_id = chainId;

  // Write the new config to a temporary file. The stream must be closed before
  // the rename: otherwise the buffered content reaches the file only when the
  // stream is destroyed, i.e. after the file has already been moved into place,
  // and a failing flush would go unnoticed.
  Path tempPath = newTargetConfig.path / fmt::format("{}.tmp", kPhysicalConfigFileName);
  std::ofstream targetConfigFile(tempPath, std::ios::out);
  if (targetConfigFile) {
    targetConfigFile << serde::toTomlString(newTargetConfig);
    targetConfigFile.close();  // flushes; sets failbit if the flush or close fails.
  }
  if (!targetConfigFile) {
    auto msg = fmt::format("Write target config file {} failed", tempPath.string());
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }

  Path targetPath = newTargetConfig.path / kPhysicalConfigFileName;
  boost::system::error_code ec;
  boost::filesystem::rename(tempPath, targetPath, ec);
  if (UNLIKELY(ec.failed())) {
    auto msg = fmt::format("Re-write target config file {} failed, error: {}", targetPath.string(), ec.message());
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }

  targetConfig_.chain_id = chainId;
  return Void{};
}
// Prepare an asynchronous read of a chunk.
// Accounts the request in the per-disk read counters (one request, aligned
// length in bytes) and then delegates to the chunk engine or to ChunkReplica,
// depending on how this target is configured.
Result<Void> StorageTarget::aioPrepareRead(AioReadJob &job) {
  readCountPerDisk_->addSample(1);
  readBytesPerDisk_->addSample(job.alignedLength());

  if (!useChunkEngine()) {
    return ChunkReplica::aioPrepareRead(chunkStore_, job);
  }
  return ChunkEngine::aioPrepareRead(*engine_, job);
}
// Finish an asynchronous read.
// Jobs that hold a chunk-engine chunk need no further work here; all other
// jobs are completed through ChunkReplica.
Result<Void> StorageTarget::aioFinishRead(AioReadJob &job) {
  const bool servedByEngine = job.state().chunkEngineJob.has_chunk();
  if (!servedByEngine) {
    return ChunkReplica::aioFinishRead(chunkStore_, job);
  }
  return Void{};
}
// Execute an update job (commit, write, remove or extend) and store the
// outcome in the job.
// Commits are dispatched directly. Every other update is executed through the
// chunk engine or ChunkReplica, and the matching monitoring counter is bumped:
// per operation type on success, per well-known error code on failure.
void StorageTarget::updateChunk(UpdateJob &job, folly::CPUThreadPoolExecutor &executor) {
  const bool viaEngine = useChunkEngine();

  if (job.type() == UpdateType::COMMIT) {
    if (viaEngine) {
      job.setResult(ChunkEngine::commit(*engine_, job, config_.kv_store().sync_when_write()));
    } else {
      job.setResult(ChunkReplica::commit(chunkStore_, job));
    }
    return;
  }

  auto result = viaEngine ? ChunkEngine::update(*engine_, job) : ChunkReplica::update(chunkStore_, job, executor);

  if (UNLIKELY(!result.hasValue())) {
    // Failure: count the error codes that are tracked individually.
    const uint32_t code = result.error().code();
    switch (code) {
      case StorageCode::kChecksumMismatch:
        storageUpdateChecksumMismatch.addSample(1);
        break;
      case StorageCode::kChunkCommittedUpdate:
        storageUpdateCommitted.addSample(1);
        break;
      case StorageCode::kChunkStaleUpdate:
        storageUpdateStale.addSample(1);
        break;
      case StorageCode::kChunkMissingUpdate:
        storageUpdateMissing.addSample(1);
        break;
      case StorageCode::kChunkAdvanceUpdate:
        storageUpdateAdvance.addSample(1);
        break;
      default:
        break;
    }
  } else {
    // Success: count syncing replacements and the kind of operation.
    if (job.options().isSyncing) {
      storageUpdateReplace.addSample(1);
    }
    const auto &io = job.updateIO();
    if (io.isWrite()) {
      storageWriteTimes.addSample(1);
    } else if (io.isRemove()) {
      storageRemoveTimes.addSample(1);
    } else if (io.isExtend()) {
      storageTruncateTimes.addSample(1);
    }
  }

  job.setResult(std::move(result));
}
// Query the chunks in `chunkIdRange`.
// When the configured point query strategy recognizes the range as addressing
// exactly one chunk, a single lookup is performed instead of a scan: it yields
// one element, or an empty list if the chunk's metadata does not exist.
// Otherwise the range is scanned by the chunk engine or the chunk store.
Result<std::vector<std::pair<ChunkId, ChunkMetadata>>> StorageTarget::queryChunks(const ChunkIdRange &chunkIdRange) {
  using ChunkList = std::vector<std::pair<ChunkId, ChunkMetadata>>;

  const auto strategy = config_.point_query_strategy();
  const bool classicPoint =
      strategy == PointQueryStrategy::CLASSIC && chunkIdRange.begin.nextChunkId() == chunkIdRange.end;
  const bool modernPoint =
      strategy == PointQueryStrategy::MODERN && chunkIdRange.begin.rangeEndForCurrentChunk() == chunkIdRange.end;

  if (classicPoint || modernPoint) {
    auto reportGuard = pointQueryRecorder.record();
    auto result = queryChunk(chunkIdRange.begin);
    if (!result.hasValue()) {
      if (result.error().code() != StorageCode::kChunkMetadataNotFound) {
        return makeError(std::move(result.error()));
      }
      // A missing chunk is a valid, empty answer.
      reportGuard.succ();
      return ChunkList{};
    }
    reportGuard.succ();
    return ChunkList(1, std::make_pair(chunkIdRange.begin, *result));
  }

  if (useChunkEngine()) {
    return ChunkEngine::queryChunks(*engine_, chunkIdRange, chainId());
  }
  return chunkStore_.queryChunks(chunkIdRange);
}
// Return the metadata of a single chunk, from the chunk engine or from the
// chunk store depending on how this target is configured.
Result<ChunkMetadata> StorageTarget::queryChunk(const ChunkId &chunkId) {
  if (!useChunkEngine()) {
    auto getResult = chunkStore_.get(chunkId);
    RETURN_AND_LOG_ON_ERROR(getResult);
    return (*getResult)->second.meta;
  }
  return ChunkEngine::queryChunk(*engine_, chunkId, chainId());
}
// Publish the size gauges of this target.
// The used size is always reported. For chunk-store targets the reserved and
// unrecycled sizes are reported too and their sum is cached as the unused
// size; if they cannot be determined both gauges are set to -1 and the error
// is logged. Always returns success.
Result<Void> StorageTarget::reportUnrecycledSize() {
  targetUsedSize_->set(usedSize());
  if (useChunkEngine()) {
    return Void{};
  }

  int64_t reservedSize = 0;
  int64_t unrecycledSize = 0;
  auto result = chunkStore_.unusedSize(reservedSize, unrecycledSize);
  if (UNLIKELY(!result)) {
    targetReservedSize_->set(-1);
    targetUnrecycledSize_->set(-1);
    XLOGF(ERR, "target get unused size failed, {}, error: {}", targetConfig_.target_id, result.error());
    return Void{};
  }

  unusedSize_ = reservedSize + unrecycledSize;
  targetReservedSize_->set(reservedSize);
  targetUnrecycledSize_->set(unrecycledSize);
  return Void{};
}
// Append a summary of every chunk of this target to `metadataVec`, read from
// the chunk engine or the chunk store depending on the target configuration.
Result<Void> StorageTarget::getAllMetadata(ChunkMetaVector &metadataVec) {
  if (!useChunkEngine()) {
    return chunkStore_.getAllMetadata(metadataVec);
  }
  return ChunkEngine::getAllMetadata(*engine_, chainId(), metadataVec);
}
// Fill `metas` with the metadata of every chunk of this target, keyed by chunk id.
// Chunk-engine targets delegate to the engine. Otherwise the chunk store's meta
// iterator is walked; an entry whose metadata cannot be parsed aborts the walk
// with kStorageInitFailed (entries stored before that remain in `metas`).
// Existing keys in `metas` are overwritten. Returns the iterator's final status.
Result<Void> StorageTarget::getAllMetadataMap(std::unordered_map<ChunkId, ChunkMetadata> &metas) {
  if (useChunkEngine()) {
    return ChunkEngine::getAllMetadataMap(*engine_, metas, chainId());
  }

  auto iteratorResult = chunkStore_.metaIterator();
  RETURN_AND_LOG_ON_ERROR(iteratorResult);
  for (auto &it = *iteratorResult; it.valid(); it.next()) {
    auto chunkId = it.chunkId();
    auto metaResult = it.meta();
    if (UNLIKELY(!metaResult)) {
      auto msg = fmt::format("storage target dump parse meta failed: {}, chunk {}", metaResult.error(), chunkId);
      XLOG(ERR, msg);
      return makeError(StorageCode::kStorageInitFailed, std::move(msg));
    }
    // Reuse the id fetched above instead of asking the iterator for it again.
    metas[std::move(chunkId)] = std::move(*metaResult);
  }
  return iteratorResult->status();
}
// Record a completed read in the per-disk success recorders.
// - bytes: added as one sample to the success byte counter.
// - latency: added as one sample to the success latency recorder.
void StorageTarget::recordRealRead(uint32_t bytes, Duration latency) const {
readSuccBytesPerDisk_->addSample(bytes);
readSuccLatencyPerDisk_->addSample(latency);
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,188 @@
#pragma once
#include <folly/Synchronized.h>
#include <unordered_map>
#include "chunk_engine/src/cxx.rs.h"
#include "common/monitor/Recorder.h"
#include "common/utils/CoLockManager.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/LockManager.h"
#include "common/utils/Path.h"
#include "storage/aio/BatchReadJob.h"
#include "storage/store/ChunkEngine.h"
#include "storage/store/ChunkStore.h"
#include "storage/store/PhysicalConfig.h"
#include "storage/update/UpdateJob.h"
namespace hf3fs::storage {
class StorageTarget : public enable_shared_from_this<StorageTarget> {
protected:
StorageTarget(const ChunkStore::Config &config,
GlobalFileStore &globalFileStore,
uint32_t diskIndex,
chunk_engine::Engine *engine);
public:
using Config = ChunkStore::Config;
~StorageTarget();
// create storage target.
Result<Void> create(const PhysicalConfig &config);
// load storage target.
Result<Void> load(const Path &path);
// add new chunk size.
Result<Void> addChunkSize(const std::vector<Size> &sizeList);
// get target id. [guaranteed loaded]
TargetId targetId() const { return TargetId{targetConfig_.target_id}; }
// get chain id. [guaranteed loaded]
ChainId chainId() const { return ChainId{targetConfig_.chain_id}; }
// set chain id.
Result<Void> setChainId(ChainId chainId);
// get disk index.
uint32_t diskIndex() const { return diskIndex_; }
// get target path. [guaranteed loaded]
Path path() const { return targetConfig_.path; }
// get all chunk metadata
Result<Void> getAllMetadata(ChunkMetaVector &metadataVec);
Result<Void> getAllMetadataMap(std::unordered_map<ChunkId, ChunkMetadata> &metas);
// lock chunk.
auto lockChunk(folly::coro::Baton &baton, const ChunkId &chunk, const std::string &tag) {
return chunkLocks_.lock(baton, chunk.data(), tag);
}
// try lock channel.
auto tryLockChannel(folly::coro::Baton &baton, const std::string &key) { return channelLocks_.tryLock(baton, key); }
// prepare aio read.
Result<Void> aioPrepareRead(AioReadJob &job);
// finish aio read.
Result<Void> aioFinishRead(AioReadJob &job);
// update chunk (write/remove/truncate).
void updateChunk(UpdateJob &job, folly::CPUThreadPoolExecutor &executor);
// query chunks: the chunk ids in result are in reverse lexicographical order
Result<std::vector<std::pair<ChunkId, ChunkMetadata>>> queryChunks(const ChunkIdRange &chunkIdRange);
// query chunk.
Result<ChunkMetadata> queryChunk(const ChunkId &chunkId);
// recycle a batch of chunks. return true if all holes are punched.
Result<bool> punchHole() {
if (useChunkEngine()) {
return true;
} else {
return chunkStore_.punchHole();
}
}
// sync meta kv.
Result<Void> sync() {
if (useChunkEngine()) {
return Void{};
} else {
return chunkStore_.sync();
}
}
// report unrecycled size.
Result<Void> reportUnrecycledSize();
// get used size.
uint64_t usedSize() const {
if (useChunkEngine()) {
return ChunkEngine::chainUsedSize(*engine_, ChainId{targetConfig_.chain_id});
} else {
return chunkStore_.usedSize();
}
}
// get unused size.
uint64_t unusedSize() const {
  // Atomic read of the cached unused-size value.
  return unusedSize_.load();
}
// get all uncommitted chunk ids.
Result<std::vector<ChunkId>> uncommitted() {
  // Dispatch to whichever backend owns this target's chunks.
  if (!useChunkEngine()) {
    return chunkStore_.uncommitted();
  }
  return ChunkEngine::queryUncommittedChunks(*engine_, chainId());
}
// reset uncommitted chunk to committed state.
Result<Void> resetUncommitted(ChainVer chainVer) {
  // Dispatch to whichever backend owns this target's chunks.
  if (!useChunkEngine()) {
    return chunkStore_.resetUncommitted(chainVer);
  }
  return ChunkEngine::resetUncommittedChunks(*engine_, chainId(), chainVer);
}
// enable or disable emergency recycling.
void setEmergencyRecycling(bool enable) {
  // Emergency recycling is a legacy chunk-store feature; ignored for chunk-engine targets.
  if (!useChunkEngine()) {
    chunkStore_.setEmergencyRecycling(enable);
  }
}
// record real read.
void recordRealRead(uint32_t bytes, Duration latency) const;
// disk monitor tag.
auto &tag() const {
  // Monitor tag set identifying the disk (not the target).
  return diskTag_;
}
// check alive or not.
std::weak_ptr<bool> aliveWeakPtr() const { return alive_; }
// global serial number.
auto generationId() const {
  // Serial number assigned to this target instance.
  return generationId_;
}
// release self.
Result<Void> release() {
  // Mark the target as released first, then flush via sync() and hand its result back.
  released_ = true;
  auto syncResult = sync();
  return syncResult;
}
// check if chunk engine is used.
bool useChunkEngine() const {
  // Controlled by the `only_chunk_engine` flag of the physical target config.
  return targetConfig_.only_chunk_engine;
}
private:
const Config &config_;
std::shared_ptr<bool> alive_ = std::make_shared<bool>();
uint32_t diskIndex_;
uint32_t generationId_;
chunk_engine::Engine *engine_{};
std::atomic<uint64_t> unusedSize_{};
monitor::TagSet diskTag_;
monitor::TagSet targetTag_;
monitor::Recorder::TagRef<monitor::CountRecorder> readCountPerDisk_;
monitor::Recorder::TagRef<monitor::CountRecorder> readBytesPerDisk_;
monitor::Recorder::TagRef<monitor::CountRecorder> readSuccBytesPerDisk_;
monitor::Recorder::TagRef<monitor::LatencyRecorder> readSuccLatencyPerDisk_;
monitor::Recorder::TagRef<monitor::ValueRecorder> targetUsedSize_;
monitor::Recorder::TagRef<monitor::ValueRecorder> targetReservedSize_;
monitor::Recorder::TagRef<monitor::ValueRecorder> targetUnrecycledSize_;
PhysicalConfig targetConfig_;
ChunkStore chunkStore_;
CoLockManager<> chunkLocks_;
CoLockManager<> channelLocks_;
folly::Synchronized<std::set<Size>, std::mutex> chunkSizeList_;
bool released_ = false;
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,297 @@
#include "storage/store/StorageTargets.h"
#include <boost/filesystem/operations.hpp>
#include <folly/experimental/coro/BlockingWait.h>
#include <folly/experimental/coro/Collect.h>
#include <folly/experimental/coro/Sleep.h>
#include <folly/experimental/coro/Task.h>
#include <memory>
#include <sys/statvfs.h>
#include <unordered_map>
#include "chunk_engine/src/cxx.rs.h"
#include "common/monitor/Sample.h"
#include "common/utils/CPUExecutorGroup.h"
#include "common/utils/Duration.h"
#include "common/utils/LogCommands.h"
#include "common/utils/Result.h"
#include "common/utils/SysResource.h"
#include "storage/service/Components.h"
namespace hf3fs::storage {
using namespace std::chrono_literals;
// Out-of-line destructor with nothing to do beyond member destruction.
StorageTargets::~StorageTargets() = default;
// Prepare per-disk state for every configured target path:
//   - the manufacturer of the device backing the path,
//   - the path -> disk index mapping,
//   - one chunk engine per path (opened concurrently on `executor`).
// Returns the first error encountered.
Result<Void> StorageTargets::init(CPUExecutorGroup &executor) {
  auto diskInfoResult = SysResource::scanDiskInfo();
  RETURN_AND_LOG_ON_ERROR(diskInfoResult);

  // device id -> manufacturer, built from the scanned disk list.
  std::unordered_map<uint32_t, std::string> manufacturerByDevice;
  for (auto &disk : *diskInfoResult) {
    manufacturerByDevice[disk.deviceId] = disk.manufacturer;
  }

  targetPaths_ = config_.target_paths();
  for (auto &targetPath : targetPaths_) {
    struct stat statBuf;
    if (::stat(targetPath.c_str(), &statBuf) != 0) {
      auto msg = fmt::format("stat {} failed: {}", targetPath, errno);
      XLOG(ERR, msg);
      return makeError(StorageCode::kStorageStatFailed, std::move(msg));
    }
    // Unknown devices map to an empty manufacturer string (operator[] default-inserts).
    manufacturers_.push_back(manufacturerByDevice[statBuf.st_dev]);
  }

  // Disk index == position of the path in the configured list.
  for (uint32_t diskIndex = 0; diskIndex < targetPaths_.size(); ++diskIndex) {
    pathToDiskIndex_[targetPaths_[diskIndex]] = diskIndex;
  }

  // Open (or create) the chunk engine under "<path>/engine" for every disk.
  std::vector<folly::coro::TaskWithExecutor<Result<rust::Box<chunk_engine::Engine>>>> engineTasks;
  for (auto &targetPath : targetPaths_) {
    auto enginePath = targetPath / "engine";
    const bool missing = !boost::filesystem::exists(enginePath);
    const bool forced = config_.create_engine_path();
    const bool shouldCreate = missing || forced;
    engineTasks.push_back(
        folly::coro::co_invoke([enginePath, shouldCreate]() -> CoTryTask<rust::Box<chunk_engine::Engine>> {
          std::string error;
          auto engine = chunk_engine::create(enginePath.c_str(), shouldCreate, sizeof(ChainId), error);
          if (!error.empty()) {
            co_return makeError(StorageCode::kStorageStatFailed, std::move(error));
          }
          co_return engine;
        }).scheduleOn(&executor.pickNext()));
  }

  auto engineResults = folly::coro::blockingWait(folly::coro::collectAllRange(std::move(engineTasks)));
  for (auto &engineResult : engineResults) {
    RETURN_AND_LOG_ON_ERROR(engineResult);
    engines_.push_back(std::move(engineResult.value()));
  }
  return Void{};
}
// Create a batch of storage targets from `createConfig`.
// Target ids are spread over the configured paths in order: the first
// `target_num_per_path` ids go to disk 0, the next to disk 1, and so on.
// The number of ids must equal paths * target_num_per_path.
Result<Void> StorageTargets::create(const CreateConfig &createConfig) {
  CPUExecutorGroup executor(1, "Creator");
  RETURN_AND_LOG_ON_ERROR(init(executor));

  auto diskPaths = config_.target_paths();
  auto perDisk = config_.target_num_per_path();
  auto idCount = createConfig.target_ids().size();

  if (diskPaths.empty()) {
    auto msg = fmt::format("List of target path is empty");
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }
  if (perDisk == 0) {
    auto msg = fmt::format("Target num per path is 0!");
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }
  if (diskPaths.size() * perDisk != idCount) {
    auto msg = fmt::format("Unable to arrange target. path size {}, target num per path {}, target id size {}",
                           diskPaths.size(),
                           perDisk,
                           idCount);
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, msg);
  }

  const auto &targetIds = createConfig.target_ids();
  for (size_t idx = 0; idx < targetIds.size(); ++idx) {
    auto &targetId = targetIds[idx];
    auto diskIndex = idx / perDisk;
    auto storageTarget = StorageTarget::enable_shared_from_this::create(config_.storage_target(),
                                                                        globalFileStore_,
                                                                        diskIndex,
                                                                        &*engines_[diskIndex]);

    // Each target lives in "<disk path>/<target id>".
    PhysicalConfig targetConfig;
    targetConfig.target_id = targetId;
    targetConfig.path = diskPaths[diskIndex] / std::to_string(targetId);
    targetConfig.physical_file_count = createConfig.physical_file_count();
    targetConfig.chunk_size_list = createConfig.chunk_size_list();
    targetConfig.allow_disk_without_uuid = createConfig.allow_disk_without_uuid();
    targetConfig.allow_existing_targets = createConfig.allow_existing_targets();
    targetConfig.only_chunk_engine = createConfig.only_chunk_engine();

    RETURN_AND_LOG_ON_ERROR(storageTarget->create(targetConfig));
    RETURN_AND_LOG_ON_ERROR(targetMap_.addStorageTarget(storageTarget));
  }
  return Void{};
}
// Create a single storage target described by `req`, or (when `req.addChunkSize`
// is set) add chunk sizes to the target that already serves `req.chainId`.
// Work is serialized per chain id through `targetLocks_`.
Result<Void> StorageTargets::create(const CreateTargetReq &req) {
  // --- request validation ---
  if (req.diskIndex >= config_.target_paths().size()) {
    auto msg = fmt::format("disk index exceed {} >= {}", req.diskIndex, config_.target_paths().size());
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }
  if (req.chainId == ChainId{}) {
    auto msg = fmt::format("target {} without chain id", req.targetId);
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }

  // --- one create per chain at a time ---
  folly::coro::Baton baton;
  auto lock = targetLocks_.lock(baton, fmt::to_string(req.chainId));
  if (!lock.locked()) {
    folly::coro::blockingWait(lock.lock());
  }

  // --- a chain may be served by at most one local target ---
  {
    auto chainOwner = targetMap_.snapshot()->getByChainId(VersionedChainId{req.chainId, {}}, true);
    if (!chainOwner) {
      if (req.addChunkSize) {
        // Adding chunk sizes only makes sense for a target that already exists.
        auto msg = fmt::format("target {} {} is not existing", req.chainId, req.targetId);
        XLOG(ERR, msg);
        return makeError(StorageCode::kStorageInitFailed, std::move(msg));
      }
    } else {
      auto ownerTargetId = (*chainOwner)->targetId;
      if (ownerTargetId != req.targetId) {
        auto msg = fmt::format("target {} is existing with same chain id {}, req target {}",
                               ownerTargetId,
                               req.chainId,
                               req.targetId);
        XLOG(ERR, msg);
        return makeError(StorageCode::kStorageInitFailed, std::move(msg));
      }
      if (req.addChunkSize) {
        RETURN_AND_LOG_ON_ERROR((*chainOwner)->storageTarget->addChunkSize(req.chunkSizeList));
      }
    }
  }

  // --- the target id is already known in memory ---
  if (targetMap_.snapshot()->getTarget(req.targetId)) {
    if (!req.allowExistingTarget) {
      auto msg = fmt::format("target {} is already existing", req.targetId);
      XLOG(ERR, msg);
      return makeError(StorageCode::kStorageInitFailed, std::move(msg));
    }
    auto existingPath = config_.target_paths()[req.diskIndex] / std::to_string(req.targetId);
    if (!boost::filesystem::exists(existingPath)) {
      auto msg = fmt::format("target {} is existing in memory, but not found in disk", req.targetId);
      XLOG(ERR, msg);
      return makeError(StorageCode::kStorageInitFailed, std::move(msg));
    }
    XLOGF(INFO, "target {} is already existing, return succ", req.targetId);
    return Void{};
  }

  // --- create the new target under "<disk path>/<target id>" ---
  auto storageTarget = StorageTarget::enable_shared_from_this::create(config_.storage_target(),
                                                                      globalFileStore_,
                                                                      req.diskIndex,
                                                                      &*engines_[req.diskIndex]);
  PhysicalConfig targetConfig;
  auto targetPath = config_.target_paths()[req.diskIndex] / std::to_string(req.targetId);
  targetConfig.path = targetPath;
  targetConfig.target_id = req.targetId;
  targetConfig.chain_id = req.chainId;
  targetConfig.physical_file_count = req.physicalFileCount;
  targetConfig.chunk_size_list = req.chunkSizeList;
  targetConfig.only_chunk_engine = req.onlyChunkEngine;
  targetConfig.allow_existing_targets = req.allowExistingTarget;
  targetConfig.allow_disk_without_uuid = config_.allow_disk_without_uuid();
  targetConfig.kv_store_type = config_.storage_target().kv_store().type();

  RETURN_AND_LOG_ON_ERROR(storageTarget->create(targetConfig));
  XLOGF(INFO, "Create storage target {} at {}", storageTarget->targetId(), targetPath.string());
  RETURN_AND_LOG_ON_ERROR(targetMap_.addStorageTarget(storageTarget));
  return Void{};
}
// Initialize the disks, then load every target directory found directly under
// each configured path. A directory counts as a target when it contains the
// physical config file. Targets are loaded concurrently on `executor`; the
// first failure is returned.
Result<Void> StorageTargets::load(CPUExecutorGroup &executor) {
  RETURN_AND_LOG_ON_ERROR(init(executor));

  std::vector<folly::coro::TaskWithExecutor<Result<Void>>> loadTasks;
  for (auto &diskPath : config_.target_paths()) {
    if (!CheckWorker::checkWritable(diskPath)) {
      // NOTE(review): the message says "skip it", but the directory is still scanned
      // below. Confirm whether a `continue` is intended here.
      XLOGF(DFATAL, "path {} isn't writable, skip it", diskPath);
    }

    for (auto &entry : boost::filesystem::directory_iterator(diskPath)) {
      auto configFile = entry / kPhysicalConfigFileName;
      if (!boost::filesystem::is_directory(entry) || !boost::filesystem::is_regular_file(configFile)) {
        continue;  // not a target directory.
      }
      loadTasks.push_back(folly::coro::co_invoke([this, entry]() -> CoTryTask<Void> {
                            co_return loadTarget(entry);
                          }).scheduleOn(&executor.pickNext()));
    }
  }

  auto loadResults = folly::coro::blockingWait(folly::coro::collectAllRange(std::move(loadTasks)));
  for (auto &loadResult : loadResults) {
    RETURN_AND_LOG_ON_ERROR(loadResult);
  }

  // Optionally gather every file descriptor of the global file store into fds_.
  if (config_.collect_all_fds()) {
    globalFileStore_.collect(fds_);
  }
  return Void{};
}
// load a target.
// Load the storage target stored in directory `targetPath` and register it in
// the target map.
//
// `targetPath` must be a direct child of one of the configured disk paths, and
// its directory name must equal the target id recorded in the target's config.
// Returns kStorageInitFailed on either violation, or the error produced by
// StorageTarget::load / addStorageTarget.
//
// load() schedules this function on several executor threads at once, so the
// shared path -> disk index map is only read through find(): one lookup instead
// of contains() + operator[], and no use of the mutating operator[] on a
// container that other threads are reading.
Result<Void> StorageTargets::loadTarget(const Path &targetPath) {
  auto diskPath = targetPath.parent_path();
  const auto diskIt = pathToDiskIndex_.find(diskPath);
  if (UNLIKELY(diskIt == pathToDiskIndex_.end())) {
    auto msg = fmt::format("Target path ({}) not belongs to any of disk paths", targetPath);
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }
  const auto diskIndex = diskIt->second;

  auto storageTarget = StorageTarget::enable_shared_from_this::create(config_.storage_target(),
                                                                      globalFileStore_,
                                                                      diskIndex,
                                                                      &*engines_[diskIndex]);
  RETURN_AND_LOG_ON_ERROR(storageTarget->load(targetPath));
  XLOGF(INFO, "Load storage target {} at {}", storageTarget->targetId(), targetPath.string());

  // The directory name is expected to be the decimal target id.
  auto targetId = storageTarget->targetId();
  if (UNLIKELY(targetPath.filename().string() != fmt::format("{}", targetId.toUnderType()))) {
    auto msg = fmt::format("Target id {} and path {} mismatch!", targetId, targetPath);
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }

  RETURN_AND_LOG_ON_ERROR(targetMap_.addStorageTarget(storageTarget));
  return Void{};
}
// Report capacity/free/available space for every configured disk.
// Results are cached; a cached copy is returned unless `force` is set or the
// cache is older than `space_info_cache_timeout`. Calls are serialized through
// the "spaceInfos" lock. Entries follow the iteration order of the
// path -> disk index map (i.e. ordered by path).
Result<std::vector<SpaceInfo>> StorageTargets::spaceInfos(bool force) {
  folly::coro::Baton baton;
  auto lock = targetLocks_.lock(baton, "spaceInfos");
  if (!lock.locked()) {
    folly::coro::blockingWait(lock.lock());
  }

  const bool cacheFresh = RelativeTime::now() - spaceInfoUpdatedTime_ < config_.space_info_cache_timeout();
  if (cacheFresh && !force) {
    return cachedSpaceInfos_;
  }

  // Group targets (and their unused bytes) by the disk directory that contains them.
  std::unordered_map<std::string, uint64_t> unusedByDisk;
  std::unordered_map<std::string, std::vector<hf3fs::flat::TargetId>> targetIdsByDisk;
  auto snapshot = targetMap_.snapshot();
  for (auto &[targetId, target] : snapshot->getTargets()) {
    auto diskDir = target.path.parent_path().string();
    targetIdsByDisk[diskDir].emplace_back(targetId);
    if (target.storageTarget != nullptr) {
      unusedByDisk[diskDir] += target.storageTarget->unusedSize();
    }
  }

  std::vector<SpaceInfo> infos;
  for (auto &[diskPath, diskIndex] : pathToDiskIndex_) {
    SpaceInfo info;
    info.path = targetPaths_[diskIndex].string();
    info.targetIds = targetIdsByDisk[info.path];

    boost::system::error_code ec{};
    auto fsSpace = boost::filesystem::space(diskPath, ec);
    if (UNLIKELY(ec.failed())) {
      auto msg = fmt::format("get space info of directory {} failed: {}", diskPath, ec.message());
      XLOG(ERR, msg);
      return makeError(StorageCode::kChunkOpenFailed, std::move(msg));
    }

    // "free" also counts space the targets hold but do not use, plus the engine's reserved size.
    auto engineUsage = engines_[diskIndex]->raw_used_size();
    info.capacity = fsSpace.capacity;
    info.free = fsSpace.free + unusedByDisk[info.path] + engineUsage.reserved_size;
    info.available = fsSpace.available;
    info.manufacturer = manufacturers_[diskIndex];
    infos.push_back(std::move(info));
  }

  cachedSpaceInfos_ = infos;
  spaceInfoUpdatedTime_ = RelativeTime::now();
  return infos;
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,104 @@
#pragma once
#include <folly/executors/CPUThreadPoolExecutor.h>
#include "chunk_engine/src/cxx.rs.h"
#include "common/utils/CPUExecutorGroup.h"
#include "common/utils/CoLockManager.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/RobinHood.h"
#include "fbs/mgmtd/HeartbeatInfo.h"
#include "fbs/storage/Common.h"
#include "storage/service/TargetMap.h"
#include "storage/store/StorageTarget.h"
namespace hf3fs::test {
struct StorageTargetsHelper;
}
namespace hf3fs::storage {
// Owns the per-disk state of a storage node (target paths, manufacturers,
// chunk engines, global file store) and creates/loads StorageTarget instances,
// registering each one in the shared AtomicallyTargetMap.
class StorageTargets {
public:
// Runtime configuration.
class Config : public ConfigBase<Config> {
// Disk directories that hold targets; the lambda rejects an empty list
// (presumably used as the item's validator — confirm against CONFIG_ITEM).
CONFIG_ITEM(target_paths, std::vector<Path>{}, [](auto &vec) { return !vec.empty(); });
// Number of targets placed on each path by the batch create().
CONFIG_ITEM(target_num_per_path, 0u);
// When true, load() collects the global file store's fds into fds().
CONFIG_HOT_UPDATED_ITEM(collect_all_fds, true);
// Maximum age of the cached spaceInfos() result.
CONFIG_HOT_UPDATED_ITEM(space_info_cache_timeout, 5_s);
// Forwarded into the physical config of targets created from a CreateTargetReq.
CONFIG_HOT_UPDATED_ITEM(allow_disk_without_uuid, false);
// When true, init() asks the chunk engine to create its directory even if it already exists.
CONFIG_HOT_UPDATED_ITEM(create_engine_path, true);
// Configuration handed to every StorageTarget.
CONFIG_OBJ(storage_target, StorageTarget::Config);
};
// Parameters of the batch create(); each item is copied into the PhysicalConfig of every new target.
class CreateConfig : public ConfigBase<CreateConfig> {
CONFIG_ITEM(target_ids, std::vector<flat::TargetId::UnderlyingType>{});
CONFIG_ITEM(physical_file_count, 256u);
CONFIG_ITEM(allow_disk_without_uuid, false);
CONFIG_ITEM(allow_existing_targets, false);
CONFIG_ITEM(chunk_size_list, (std::vector<Size>{512_KB, 1_MB, 2_MB, 4_MB, 16_MB, 64_MB}));
CONFIG_ITEM(only_chunk_engine, false);
};
// Both arguments are stored by reference and must outlive this object.
StorageTargets(const Config &config, AtomicallyTargetMap &targetMap)
: config_(config),
targetMap_(targetMap) {}
~StorageTargets();
// scan disks, build the path -> disk index map and open one chunk engine per path.
Result<Void> init(CPUExecutorGroup &executor);
// create a batch of storage targets.
Result<Void> create(const CreateConfig &createConfig);
// create new storage target.
Result<Void> create(const CreateTargetReq &req);
// open a batch of storage targets.
Result<Void> load(CPUExecutorGroup &executor);
// load a target.
Result<Void> loadTarget(const Path &targetPath);
// get fd list.
auto &fds() const { return fds_; }
// get space info.
Result<std::vector<SpaceInfo>> spaceInfos(bool force);
// get target paths.
auto &targetPaths() const { return targetPaths_; }
// get manufacturers.
auto &manufacturers() const { return manufacturers_; }
// global file store.
auto &globalFileStore() { return globalFileStore_; }
// chunk engines.
auto &engines() const { return engines_; }
// remove all chunks of `chainId` from the chunk engine of disk `diskIndex`.
// NOTE(review): `diskIndex` is not range-checked against engines_ here — confirm callers validate it.
Result<Void> removeChunkEngineTarget(ChainId chainId, uint32_t diskIndex) {
auto &engine = *engines_[diskIndex];
return ChunkEngine::removeAllChunks(engine, chainId);
}
private:
friend struct test::StorageTargetsHelper;
ConstructLog<"storage::StorageTargets"> constructLog_;
const Config &config_;
AtomicallyTargetMap &targetMap_;
GlobalFileStore globalFileStore_;
// The three containers below are filled by init(); targetPaths_, manufacturers_ and
// engines_ are indexed by disk index.
std::vector<Path> targetPaths_;
std::vector<std::string> manufacturers_;
std::map<Path, uint32_t> pathToDiskIndex_;
std::vector<rust::Box<chunk_engine::Engine>> engines_;
// Keyed locks: per chain id in create(req), and the fixed key "spaceInfos".
CoLockManager<> targetLocks_;
// Cache for spaceInfos().
RelativeTime spaceInfoUpdatedTime_;
std::vector<SpaceInfo> cachedSpaceInfos_;
// Filled by load() when collect_all_fds is enabled.
std::vector<int> fds_;
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,460 @@
#include "storage/sync/ResyncWorker.h"
#include <folly/ScopeGuard.h>
#include <folly/experimental/coro/Collect.h>
#include "common/monitor/Recorder.h"
#include "common/utils/Duration.h"
#include "common/utils/Result.h"
#include "fbs/storage/Common.h"
#include "storage/service/Components.h"
#include "storage/update/UpdateJob.h"
namespace hf3fs::storage {
// File-local monitoring recorders used by ResyncWorker.
namespace {
// One operation per handleSync() run.
monitor::OperationRecorder resyncRecorder{"storage.resync"};
// Sync-start requests rejected with a routing version mismatch.
monitor::CountRecorder resyncRoutingVersionMismatch{"storage.resync.routing_version_mismatch"};
// One operation per forwarded WRITE / REMOVE (see forward()).
monitor::OperationRecorder syncingWriteRecorder{"storage.syncing.write_count"};
monitor::OperationRecorder syncingRemoveRecorder{"storage.syncing.remove_count"};
// Per-run counters produced while comparing local and remote chunk metadata in handleSync().
monitor::CountRecorder syncingSkipCount{"storage.syncing.skip_count"};
monitor::CountRecorder syncingRemoteMissCount{"storage.syncing.remote_miss_count"};
monitor::CountRecorder syncingRemoteChainVersionLowCount{"storage.syncing.chain_version_low"};
monitor::CountRecorder syncingRemoteChainVersionHighCount{"storage.syncing.chain_version_high"};
monitor::CountRecorder syncingLocalUncommittedCount{"storage.syncing.local_uncommitted"};
monitor::CountRecorder syncingCommitVersionMismatchCount{"storage.syncing.commit_version_mismatch"};
monitor::CountRecorder syncingCurrentChainIsWritingCount{"storage.syncing.current_chain_is_writing"};
monitor::CountRecorder syncingRemoteUncommittedCount{"storage.syncing.remote_uncommitted"};
monitor::CountRecorder syncingRemoteFullSyncLightCount{"storage.syncing.full_sync_light"};
monitor::CountRecorder syncingRemoteFullSyncHeavyCount{"storage.syncing.full_sync_heavy"};
// Local chunks skipped because they are not in the NORMAL recycle state.
monitor::CountRecorder syncingLocalChunkInRecycleState{"storage.syncing.chunk_in_recycle_state"};
// forward() decisions taken after re-querying the chunk under its lock.
monitor::CountRecorder syncingSkipRemoveAfterUpdate{"storage.syncing.skip_remove_after_update"};
monitor::CountRecorder syncingSkipUpdateAfterRemove{"storage.syncing.skip_update_after_remove"};
// Gauges: number of chains currently syncing, and chunks still to forward in a run.
monitor::ValueRecorder syncingRemainingTargetsCount{"storage.syncing.remaining_targets_count", std::nullopt, false};
monitor::ValueRecorder syncingRemainingChunksCount{"storage.syncing.remaining_chunks_count", std::nullopt, false};
}  // namespace
// Wires the worker to its configuration and the shared service components.
// `config` and `components` are kept by reference.
ResyncWorker::ResyncWorker(const Config &config, Components &components)
: config_(config),
components_(components),
// Thread pool named "Sync"; min and max thread counts are both config_.num_threads().
executors_(std::make_pair(config_.num_threads(), config_.num_threads()),
std::make_shared<folly::NamedThreadFactory>("Sync")),
// The sync job pool is given the executor pool above.
pool_(config_.pool(), &executors_),
updateChannelAllocator_(config_.num_channels()),
batchConcurrencyLimiter_(config_.batch_concurrency_limiter()) {}
// Start the job pool (each job runs handleSync for one chain) and the
// scheduling loop, then mark the worker as started.
Result<Void> ResyncWorker::start() {
  auto poolStarted =
      pool_.start([this](VersionedChainId vChainId) -> CoTask<void> { co_await handleSync(vChainId); });
  RETURN_AND_LOG_ON_ERROR(poolStarted);

  executors_.add([this] { loop(); });
  started_ = true;
  return Void{};
}
// Ask loop() to stop, wait (polling every 100ms) until it reports stopped,
// then stop the job pool. If the worker was never started only the pool is stopped.
Result<Void> ResyncWorker::stopAndJoin() {
  stopping_ = true;
  cond_.notify_one();

  int attempt = 0;
  while (started_ && !stopped_) {
    // Log on every fifth poll only.
    XLOGF_IF(INFO, attempt % 5 == 0, "Waiting for ResyncWorker@{}::loop stop...", fmt::ptr(this));
    std::this_thread::sleep_for(100_ms);
    ++attempt;
  }

  pool_.stopAndJoin();
  return Void{};
}
// Scheduling loop: every 500ms (or earlier when woken for shutdown) look at
// the chains currently in syncing state and enqueue a sync job for each chain
// that is not already syncing and whose last sync ended more than 30s ago.
void ResyncWorker::loop() {
  while (!stopping_) {
    auto lock = std::unique_lock(mutex_);
    const bool stopRequested = cond_.wait_for(lock, 500_ms, [&] { return stopping_.load(); });
    if (stopRequested) {
      break;
    }

    auto chains = components_.targetMap.snapshot()->syncingChains();
    syncingRemainingTargetsCount.set(chains.size());
    // Randomize the order so no chain is systematically favoured.
    std::shuffle(chains.begin(), chains.end(), std::mt19937{std::random_device{}()});

    for (auto &vChainId : chains) {
      if (stopping_) {
        break;
      }
      // Atomically claim the chain under its shard lock.
      bool claimed = shards_.withLock(
          [vChainId](SyncingChainIds &syncingChainIds) {
            auto &status = syncingChainIds[vChainId.chainId];
            if (status.isSyncing || !(RelativeTime::now() - status.lastSyncingTime > 30_s)) {
              XLOGF(DBG, "chain id {} is syncing", vChainId.chainId);
              return false;
            }
            status.isSyncing = true;
            return true;
          },
          vChainId.chainId);
      if (claimed) {
        pool_.enqueueSync(vChainId);
      }
    }
  }
  stopped_ = true;
  XLOGF(INFO, "ResyncWorker@{}::loop stopped", fmt::ptr(this));
}
// Bring the successor of the local target of chain `vChainId` up to date.
//
// Steps:
//   1. arrange for the chain's "is syncing" flag to be cleared on exit;
//   2. resolve the local target for the chain;
//   3. send sync-start to the successor and receive its chunk metadata;
//   4. diff remote metadata against local metadata to build a remove list
//      (chunks only the successor has) and a write list (chunks that are
//      missing or outdated on the successor);
//   5. forward removes, then writes, in batches of config_.batch_size();
//   6. send sync-done.
//
// Inconsistencies that must never happen (remote ahead of a committed local
// chunk, commit version or checksum mismatch on a chunk not written under the
// current chain version) are treated as fatal: the successor is asked to
// offline the target and the sync fails.
//
// Returns Void on success, otherwise the first error encountered.
CoTryTask<void> ResyncWorker::handleSync(VersionedChainId vChainId) {
  auto fullSyncLevel = config_.full_sync_level();
  auto needFullSync = fullSyncLevel != FullSyncLevel::NONE &&
                      (config_.full_sync_chains().empty() || config_.full_sync_chains().contains(vChainId.chainId));
  bool heavyFullSync = needFullSync && fullSyncLevel == FullSyncLevel::HEAVY;

  // 1. Cancel the syncing state on exit.
  auto guard = folly::makeGuard([&] {
    shards_.withLock(
        [&](SyncingChainIds &syncingChainIds) {
          XLOGF(DBG9, "sync exit chain {}", vChainId);
          auto &status = syncingChainIds[vChainId.chainId];
          status.isSyncing = false;
          status.lastSyncingTime = RelativeTime::now();
        },
        vChainId.chainId);
  });
  XLOGF(DBG9, "start sync chain {}", vChainId);

  // 2. find target and routing.
  auto targetResult = components_.targetMap.getByChainId(vChainId);
  if (UNLIKELY(!targetResult)) {
    auto msg = fmt::format("sync start {} get routing failed: {}", vChainId, targetResult.error());
    XLOG(ERR, msg);
    co_return makeError(StorageCode::kSyncSendStartFailed, std::move(msg));
  }
  auto target = std::move(*targetResult);
  auto targetId = target->targetId;

  // Synthetic client id for forwarded updates: the uuid bytes carry the
  // versioned chain id followed by the target id.
  ClientId clientId{};
  static_assert(sizeof(ClientId::uuid) == sizeof(VersionedChainId) + sizeof(TargetId));
  *reinterpret_cast<VersionedChainId *>(clientId.uuid.data) = vChainId;
  *reinterpret_cast<TargetId *>(clientId.uuid.data + sizeof(VersionedChainId)) = targetId;

  monitor::TagSet tag;
  tag.addTag("instance", fmt::format("{}-{}", targetId, vChainId.chainVer));

  uint32_t currentSyncingRemoteMissCount = 0;
  uint32_t currentSyncingRemoteChainVersionLowCount = 0;
  uint32_t currentSyncingRemoteChainVersionHighCount = 0;
  uint32_t currentSyncingRemoteUncommittedCount = 0;
  uint32_t currentSyncingLocalUncommittedCount = 0;
  uint32_t currentSyncingCommitVersionMismatchCount = 0;
  uint32_t currentSyncingCurrentChainIsWritingCount = 0;
  uint32_t currentSyncingRemoteFullSyncHeavyCount = 0;
  uint32_t currentSyncingRemoteFullSyncLightCount = 0;
  uint32_t currentSyncingSkipCount = 0;
  auto recordGuard = resyncRecorder.record(tag);
  auto remainingChunksCount = syncingRemainingChunksCount.getRecoderWithTag(tag);
  // Whatever happens, the "remaining chunks" gauge of this instance goes back to 0.
  SCOPE_EXIT { remainingChunksCount->set(0); };

  // 3. sync start.
  net::UserRequestOptions options;
  options.timeout = config_.sync_start_timeout();
  std::vector<ChunkMeta> remoteMetas;
  auto addrResult = target->getSuccessorAddr();
  if (UNLIKELY(!addrResult)) {
    XLOGF(ERR, "sync start get successor addr error: {}", addrResult.error());
    co_return makeError(std::move(addrResult.error()));
  }
  {
    SyncStartReq syncStartReq;
    syncStartReq.vChainId = vChainId;
    auto syncStartResult = co_await components_.messenger.syncStart(*addrResult, syncStartReq, &options);
    if (UNLIKELY(!syncStartResult)) {
      if (syncStartResult.error().code() == StorageClientCode::kRoutingVersionMismatch) {
        // Routing mismatch is counted separately and not recorded as a resync failure.
        recordGuard.dismiss();
        resyncRoutingVersionMismatch.addSample(1);
        auto msg = fmt::format("sync start {} request failed: {}", vChainId, syncStartResult.error());
        XLOG(DBG9, msg);
        co_return makeError(std::move(syncStartResult.error()));
      }
      auto msg = fmt::format("sync start {} request failed: {}", vChainId, syncStartResult.error());
      XLOG(ERR, msg);
      co_return makeError(StorageCode::kSyncSendStartFailed, std::move(msg));
    }
    remoteMetas = std::move(syncStartResult->metas);
  }

  // 4. syncing: diff remote metadata against local metadata.
  std::unordered_map<ChunkId, ChunkMetadata> localMetas;
  auto result = target->storageTarget->getAllMetadataMap(localMetas);
  if (UNLIKELY(!result)) {
    XLOGF(ERR, "target invalid iterator {}, error {}", targetId, result.error());
    co_return makeError(std::move(result.error()));
  }
  // re-check current chain version.
  {
    auto targetResult = components_.targetMap.getByChainId(vChainId);
    if (UNLIKELY(!targetResult)) {
      auto msg = fmt::format("sync re-check {} get routing failed: {}", vChainId, targetResult.error());
      XLOG(ERR, msg);
      co_return makeError(StorageCode::kSyncSendStartFailed, std::move(msg));
    }
  }

  std::vector<std::pair<ChunkId, uint32_t>> writeList;
  std::vector<ChunkId> removeList;
  bool hasFatalEvents = false;
  for (auto &remoteMeta : remoteMetas) {
    // (a) check exists: chunks unknown locally must be removed on the successor.
    auto it = localMetas.find(remoteMeta.chunkId);
    if (it == localMetas.end()) {
      removeList.push_back(remoteMeta.chunkId);
      continue;
    }
    // Every local chunk matched here is dropped from localMetas, so what is
    // left after the loop are the chunks the successor does not have.
    SCOPE_EXIT { localMetas.erase(it); };

    // (b) check recycle state.
    const auto &chunkId = it->first;
    const auto &meta = it->second;
    if (UNLIKELY(meta.recycleState != RecycleState::NORMAL)) {
      XLOGF(WARNING, "target {} chunk {} in recycle state: {}", targetId, chunkId, meta);
      syncingLocalChunkInRecycleState.addSample(1);
      continue;  // skip chunk in recycle state.
    }

    // (c) decide whether the local chunk must be forwarded.
    bool needForward = true;
    if (meta.chainVer > remoteMeta.chainVer) {
      // remote was written under an older chain version: forward.
      ++currentSyncingRemoteChainVersionLowCount;
    } else if (remoteMeta.updateVer != remoteMeta.commitVer || remoteMeta.chunkState != ChunkState::COMMIT) {
      // remote chunk is not committed: forward.
      XLOGF(WARNING, "chain {} remote uncommitted {}", vChainId.chainId, remoteMeta);
      ++currentSyncingRemoteUncommittedCount;
    } else if (meta.chainVer < remoteMeta.chainVer) {
      if (meta.chunkState == ChunkState::COMMIT) {
        // remote is ahead of a committed local chunk: fatal.
        ++currentSyncingRemoteChainVersionHighCount;
        XLOGF(DFATAL, "chain {} remote chain version high, local {}, remote {}", vChainId, meta, remoteMeta);
        hasFatalEvents = true;
        break;
      } else {
        needForward = false;
        ++currentSyncingLocalUncommittedCount;
        XLOGF(CRITICAL, "chain {} local uncommitted, local {}, remote {}", vChainId, meta, remoteMeta);
      }
    } else if (meta.updateVer != remoteMeta.commitVer) {
      if (meta.chainVer != vChainId.chainVer && meta.chunkState == ChunkState::COMMIT) {
        // same chain version, different versions, and not being written now: fatal.
        ++currentSyncingCommitVersionMismatchCount;
        XLOGF(DFATAL, "chain {} commit version mismatch, local {}, remote {}", vChainId, meta, remoteMeta);
        hasFatalEvents = true;
        break;
      } else {
        needForward = false;
        ++currentSyncingCurrentChainIsWritingCount;
        XLOGF(CRITICAL, "chain {} chain is writing, local {}, remote {}", vChainId, meta, remoteMeta);
      }
    } else if (heavyFullSync) {
      // heavy full sync forwards even chunks whose versions match.
      ++currentSyncingRemoteFullSyncHeavyCount;
    } else if (meta.checksum() != remoteMeta.checksum) {
      if (meta.chainVer != vChainId.chainVer) {
        XLOGF(DFATAL, "chain {} checksum not equal, local {}, remote {}", vChainId, meta, remoteMeta);
        ++currentSyncingRemoteFullSyncLightCount;
        hasFatalEvents = true;
        break;
      } else {
        needForward = false;
        ++currentSyncingCurrentChainIsWritingCount;
        XLOGF(CRITICAL,
              "chain {} checksum not equal because of writing, local {}, remote {}",
              vChainId,
              meta,
              remoteMeta);
      }
    } else {
      // local and remote agree.
      needForward = false;
    }

    if (needForward) {
      writeList.emplace_back(chunkId, meta.innerFileId.chunkSize);
    } else {
      ++currentSyncingSkipCount;
    }
  }

  // Report the comparison counters before acting on fatal events. Several of
  // them (chain version high, commit version mismatch, light full-sync checksum
  // mismatch) are only incremented on the fatal path, so sampling them after
  // the early return below would always report 0.
  syncingRemoteChainVersionLowCount.addSample(currentSyncingRemoteChainVersionLowCount, tag);
  syncingRemoteChainVersionHighCount.addSample(currentSyncingRemoteChainVersionHighCount, tag);
  syncingLocalUncommittedCount.addSample(currentSyncingLocalUncommittedCount, tag);
  syncingRemoteUncommittedCount.addSample(currentSyncingRemoteUncommittedCount, tag);
  syncingCommitVersionMismatchCount.addSample(currentSyncingCommitVersionMismatchCount, tag);
  syncingCurrentChainIsWritingCount.addSample(currentSyncingCurrentChainIsWritingCount, tag);
  syncingRemoteFullSyncHeavyCount.addSample(currentSyncingRemoteFullSyncHeavyCount, tag);
  syncingRemoteFullSyncLightCount.addSample(currentSyncingRemoteFullSyncLightCount, tag);
  syncingSkipCount.addSample(currentSyncingSkipCount, tag);

  if (UNLIKELY(hasFatalEvents)) {
    auto msg = fmt::format("sync {} has fatal events", vChainId);
    XLOG(CRITICAL, msg);
    // Ask the successor to force the target offline.
    OfflineTargetReq req;
    req.targetId = targetId;
    req.force = true;
    CO_RETURN_AND_LOG_ON_ERROR(co_await components_.messenger.offlineTarget(*addrResult, req, &options));
    co_return makeError(StorageCode::kSyncSendStartFailed, std::move(msg));
  }

  // Local chunks the successor did not report at all.
  for (auto &[chunkId, meta] : localMetas) {
    writeList.emplace_back(chunkId, meta.innerFileId.chunkSize);
    ++currentSyncingRemoteMissCount;
  }
  syncingRemoteMissCount.addSample(currentSyncingRemoteMissCount, tag);

  // 5. forward removes, then writes, batch by batch.
  auto batchSize = config_.batch_size();
  auto remainingCount = writeList.size() + removeList.size();
  remainingChunksCount->set(remainingCount);

  for (auto batchStart = 0ul; batchStart < removeList.size(); batchStart += batchSize) {
    // Re-resolve the target before every batch so a routing change aborts the sync.
    auto targetResult = components_.targetMap.getByChainId(vChainId);
    if (UNLIKELY(!targetResult)) {
      auto msg = fmt::format("sync re-check {} get routing failed: {}", vChainId, targetResult.error());
      XLOG(ERR, msg);
      co_return makeError(StorageCode::kSyncSendStartFailed, std::move(msg));
    }
    target = std::move(*targetResult);

    std::vector<CoTryTask<void>> batch;
    for (auto idx = batchStart; idx < removeList.size() && idx < batchStart + batchSize; ++idx) {
      batch.push_back(forward(target, tag, clientId, std::move(removeList[idx]), UpdateType::REMOVE, 0));
    }

    auto guard = batchConcurrencyLimiter_.lock(0);
    auto results = co_await folly::coro::collectAllRange(std::move(batch));
    for (auto &result : results) {
      if (UNLIKELY(!result)) {
        XLOGF(ERR, "target {} forward remove failed {}", targetId, result.error());
        CO_RETURN_ERROR(result);
      }
    }
    remainingCount -= results.size();
    remainingChunksCount->set(remainingCount);
  }

  for (auto batchStart = 0ul; batchStart < writeList.size(); batchStart += batchSize) {
    auto targetResult = components_.targetMap.getByChainId(vChainId);
    if (UNLIKELY(!targetResult)) {
      auto msg = fmt::format("sync re-check {} get routing failed: {}", vChainId, targetResult.error());
      XLOG(ERR, msg);
      co_return makeError(StorageCode::kSyncSendStartFailed, std::move(msg));
    }
    target = std::move(*targetResult);

    std::vector<CoTryTask<void>> batch;
    for (auto idx = batchStart; idx < writeList.size() && idx < batchStart + batchSize; ++idx) {
      auto &[chunkId, chunkSize] = writeList[idx];
      batch.push_back(forward(target, tag, clientId, std::move(chunkId), UpdateType::WRITE, chunkSize));
    }

    auto guard = batchConcurrencyLimiter_.lock(0);
    auto results = co_await folly::coro::collectAllRange(std::move(batch));
    for (auto &result : results) {
      if (UNLIKELY(!result)) {
        XLOGF(ERR, "target {} forward write failed {}", targetId, result.error());
        CO_RETURN_ERROR(result);
      }
    }
    remainingCount -= results.size();
    remainingChunksCount->set(remainingCount);
  }

  // 6. sync done.
  {
    SyncDoneReq syncDoneReq;
    syncDoneReq.vChainId = vChainId;
    auto addrResult = target->getSuccessorAddr();
    if (UNLIKELY(!addrResult)) {
      XLOGF(ERR, "sync done get successor addr error: {}", addrResult.error());
      co_return makeError(std::move(addrResult.error()));
    }
    auto syncDoneResult = co_await components_.messenger.syncDone(*addrResult, syncDoneReq);
    if (UNLIKELY(!syncDoneResult)) {
      auto msg = fmt::format("sync done {} request failed: {}", vChainId, syncDoneResult.error());
      XLOG(ERR, msg);
      co_return makeError(StorageCode::kSyncSendDoneFailed, std::move(msg));
    }
    if (UNLIKELY(!syncDoneResult->result.lengthInfo)) {
      auto msg = fmt::format("sync done {} request failed: {}", vChainId, syncDoneResult->result.lengthInfo.error());
      XLOG(ERR, msg);
      co_return makeError(StorageCode::kSyncSendDoneFailed, std::move(msg));
    }
  }

  recordGuard.succ();
  XLOGF(INFO,
        "sync done chain {} target {} update {} remove {}",
        vChainId,
        targetId,
        writeList.size(),
        removeList.size());
  co_return Void{};
}
// Forward one chunk update (WRITE or REMOVE) of `chunkId` to the successor.
//
// The chunk is locked first and its current state re-queried, because it may
// have changed since the sync plan was computed:
//   - REMOVE of a chunk that now exists in NORMAL state is skipped;
//   - WRITE of a chunk that no longer exists is skipped;
//   - for an existing chunk the latest chunk size replaces `chunkSize`.
// Any other query error is returned as is.
CoTryTask<void> ResyncWorker::forward(const TargetPtr &target,
                                      const monitor::TagSet &tag,
                                      const ClientId &clientId,
                                      ChunkId chunkId,
                                      UpdateType updateType,
                                      uint32_t chunkSize) {
  const bool isRemove = updateType == UpdateType::REMOVE;
  auto recordGuard = isRemove ? syncingRemoveRecorder.record(tag) : syncingWriteRecorder.record(tag);

  // Take the per-chunk lock, waiting if somebody else currently holds it.
  folly::coro::Baton baton;
  auto lockGuard = target->storageTarget->lockChunk(baton, chunkId, "sync");
  if (!lockGuard.locked()) {
    XLOGF(WARNING, "target {} chunk {} wait lock, current tag: {}", *target, chunkId, lockGuard.currentTag());
    co_await lockGuard.lock();
  }

  auto chunkResult = target->storageTarget->queryChunk(chunkId);
  if (!chunkResult) {
    if (chunkResult.error().code() != StorageCode::kChunkMetadataNotFound) {
      co_return makeError(std::move(chunkResult.error()));
    }
    // chunk does not exist.
    if (updateType == UpdateType::WRITE) {
      XLOGF(WARNING, "target {} chunk {} has been removed, skip updated", *target, chunkId);
      syncingSkipUpdateAfterRemove.addSample(1);
      recordGuard.succ();
      co_return Void{};
    }
  } else {
    // chunk exists.
    if (isRemove && chunkResult->recycleState == RecycleState::NORMAL) {
      XLOGF(WARNING, "target {} chunk {} has been updated, skip remove", *target, chunkId);
      syncingSkipRemoveAfterUpdate.addSample(1);
      recordGuard.succ();
      co_return Void{};
    }
    chunkSize = chunkResult->innerFileId.chunkSize;  // use latest chunk size.
  }

  // Every forwarded update occupies one update channel until this coroutine finishes.
  UpdateChannel channel;
  if (UNLIKELY(!updateChannelAllocator_.allocate(channel))) {
    XLOGF(ERR, "no channel to forward sync write");
    co_return makeError(StorageClientCode::kResourceBusy);
  }
  auto channelGuard = folly::makeGuard([&] { updateChannelAllocator_.release(channel); });

  UpdateReq req;
  // payload
  req.payload.updateType = updateType;
  req.payload.key.chunkId = chunkId;
  req.payload.key.vChainId = target->vChainId;
  req.payload.offset = 0;
  req.payload.chunkSize = chunkSize;
  req.payload.updateVer = ChunkVer{1};
  req.payload.checksum.type = ChecksumType::CRC32C;
  // request identity
  req.tag.clientId = clientId;
  req.tag.requestId = RequestId{++requestId_};
  req.tag.channel = channel;
  // options: mark the request as an internal syncing update.
  req.options.fromClient = false;
  req.options.isSyncing = true;
  req.options.commitChainVer = target->vChainId.chainVer;

  CommitIO commitIO;
  TargetPtr t = target;
  ServiceRequestContext requestCtx{"resync"};
  ChunkEngineUpdateJob chunkEngineJob;
  auto forwardResult =
      co_await components_.reliableForwarding.forwardWithRetry(requestCtx, req, {}, chunkEngineJob, t, commitIO, false);
  CO_RETURN_ON_ERROR(forwardResult.lengthInfo);

  recordGuard.succ();
  co_return Void{};
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,84 @@
#pragma once
#include <atomic>
#include <condition_variable>
#include <folly/executors/CPUThreadPoolExecutor.h>
#include <mutex>
#include "client/storage/UpdateChannelAllocator.h"
#include "common/serde/Serde.h"
#include "common/utils/ConcurrencyLimiter.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/CoroutinesPool.h"
#include "common/utils/Duration.h"
#include "common/utils/Shards.h"
#include "fbs/storage/Common.h"
#include "storage/service/TargetMap.h"
namespace hf3fs::storage {
struct Components;
// Worker that handles chain sync jobs and forwards the individual chunk updates.
//
// Sync jobs are keyed by VersionedChainId (see handleSync); per-chain syncing status is kept in
// sharded maps, and forwarded requests draw their ids from a single atomic counter.
class ResyncWorker {
 public:
  enum FullSyncLevel {
    NONE,
    HEAVY,  // sync all.
  };

  struct Config : ConfigBase<Config> {
    CONFIG_ITEM(num_threads, 16ul);
    CONFIG_ITEM(num_channels, 1024u);
    CONFIG_HOT_UPDATED_ITEM(batch_size, 16u);
    CONFIG_HOT_UPDATED_ITEM(sync_start_timeout, 10_s);
    CONFIG_HOT_UPDATED_ITEM(full_sync_chains, std::set<uint32_t>{});  // full sync all chains if it is empty.
    CONFIG_HOT_UPDATED_ITEM(full_sync_level, FullSyncLevel::NONE);
    CONFIG_OBJ(pool, CoroutinesPoolBase::Config);
    CONFIG_OBJ(batch_concurrency_limiter, ConcurrencyLimiterConfig, [](auto &c) { c.set_max_concurrency(64); });
  };

  ResyncWorker(const Config &config, Components &components);

  // start resync worker.
  Result<Void> start();

  // stop resync worker. End all sync tasks immediately.
  Result<Void> stopAndJoin();

 protected:
  void loop();

  // handle sync job.
  CoTryTask<void> handleSync(VersionedChainId vChainId);

  // forward sync request.
  CoTryTask<void> forward(const TargetPtr &target,
                          const monitor::TagSet &tag,
                          const ClientId &clientId,
                          ChunkId chunkId,
                          UpdateType updateType,
                          uint32_t chunkSize);

 private:
  ConstructLog<"storage::ResyncWorker"> constructLog_;
  // Both are held by reference; the worker does not own them.
  const Config &config_;
  Components &components_;

  folly::CPUThreadPoolExecutor executors_;
  CoroutinesPool<VersionedChainId> pool_;
  client::UpdateChannelAllocator updateChannelAllocator_;
  ConcurrencyLimiter<uint32_t> batchConcurrencyLimiter_;

  std::mutex mutex_;
  std::condition_variable cond_;
  std::atomic<bool> stopping_ = false;
  std::atomic<bool> started_ = false;
  std::atomic<bool> stopped_ = false;

  struct SyncingStatus {
    SERDE_STRUCT_FIELD(isSyncing, false);
    SERDE_STRUCT_FIELD(lastSyncingTime, RelativeTime{});
  };
  using SyncingChainIds = robin_hood::unordered_map<ChainId, SyncingStatus>;
  Shards<SyncingChainIds, 32> shards_;

  // Source of request ids for forwarded sync requests (pre-incremented per request).
  // Explicitly zero-initialized, like the other atomics above, so the first id does not depend on
  // the standard library's default-construction behaviour for std::atomic.
  std::atomic_uint64_t requestId_{0};
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,118 @@
#pragma once
#include <folly/experimental/coro/Baton.h>
#include "chunk_engine/src/cxx.rs.h"
#include "fbs/storage/Common.h"
#include "storage/store/ChunkMetadata.h"
#include "storage/store/ChunkStore.h"
namespace hf3fs::storage {
class StorageTarget;
// Scope guard for a chunk engine writing chunk.
//
// Holds an (engine, chunk) pair. While both are set, reset() and the destructor hand the chunk back
// through Engine::release_writing_chunk. release() detaches the engine without touching the chunk,
// after which the guard no longer releases anything.
class ChunkEngineUpdateJob {
 public:
  ChunkEngineUpdateJob() = default;
  ChunkEngineUpdateJob(const ChunkEngineUpdateJob &) = delete;
  // Steals both pointers; the moved-from job is left empty and releases nothing.
  ChunkEngineUpdateJob(ChunkEngineUpdateJob &&other) noexcept
      : engine_(std::exchange(other.engine_, nullptr)),
        chunk_(std::exchange(other.chunk_, nullptr)) {}

  // Replace the guarded pair; a previously guarded chunk is released first.
  void set(chunk_engine::Engine &engine, chunk_engine::WritingChunk *chunk) {
    reset();
    engine_ = &engine;
    chunk_ = chunk;
  }

  // Detach and return the engine pointer. The chunk pointer stays readable through chunk().
  auto release() { return std::exchange(engine_, nullptr); }

  auto chunk() const { return chunk_; }

  // Release the guarded chunk, if any. The chunk pointer is cleared together with the engine so
  // that chunk() cannot return a pointer that was already handed back to the engine.
  void reset() {
    if (engine_ && chunk_) {
      release()->release_writing_chunk(std::exchange(chunk_, nullptr));
    }
  }

  ~ChunkEngineUpdateJob() { reset(); }

 private:
  chunk_engine::Engine *engine_{};
  chunk_engine::WritingChunk *chunk_{};
};
// One update or commit operation on a chunk, together with its result and a completion baton.
//
// The request context and the ChunkEngineUpdateJob are stored by reference; the IO descriptors and
// options are copied into the job.
class UpdateJob {
public:
// Build a job from an UpdateIO; the job type and chunk id are taken from updateIO.
UpdateJob(ServiceRequestContext &requestCtx,
const UpdateIO &updateIO,
const UpdateOptions &options,
ChunkEngineUpdateJob &chunkEngineJob,
std::shared_ptr<StorageTarget> target,
bool allowToAllocate = true)
: requestCtx_(requestCtx),
type_(updateIO.updateType),
chunkId_(updateIO.key.chunkId),
target_(std::move(target)),
updateIO_(updateIO),
chunkEngineJob_(chunkEngineJob),
options_(options),
allowToAllocate_(allowToAllocate) {}
// Build a COMMIT job from a CommitIO; allowToAllocate_ keeps its default (true).
UpdateJob(ServiceRequestContext &requestCtx,
const CommitIO &commitIO,
const UpdateOptions &options,
ChunkEngineUpdateJob &chunkEngineJob,
std::shared_ptr<StorageTarget> target)
: requestCtx_(requestCtx),
type_(UpdateType::COMMIT),
chunkId_(commitIO.key.chunkId),
target_(std::move(target)),
commitIO_(commitIO),
chunkEngineJob_(chunkEngineJob),
options_(options) {}
// Accessors.
auto &requestCtx() { return requestCtx_; }
auto type() const { return type_; }
const auto &chunkId() const { return chunkId_; }
auto &target() const { return target_; }
auto &updateIO() { return updateIO_; }
auto &commitIO() { return commitIO_; }
auto &chunkEngineJob() { return chunkEngineJob_; }
auto &options() { return options_; }
auto &result() { return result_; }
auto &state() { return state_; }
auto allowToAllocate() const { return allowToAllocate_; }
// Chain version to commit with: the version from the options for syncing requests, otherwise the
// CommitIO's version for COMMIT jobs, otherwise the chain version of the UpdateIO key.
ChainVer commitChainVer() const {
if (options_.isSyncing) {
return options_.commitChainVer;
} else if (type() == UpdateType::COMMIT) {
return commitIO_.commitChainVer;
} else {
return updateIO_.key.vChainId.chainVer;
}
}
// Suspend until setResult() has posted the baton.
CoTask<void> complete() const { co_await baton_; }
// Store the length result and post the baton that complete() awaits.
void setResult(Result<uint32_t> result) {
result_.lengthInfo = std::move(result);
baton_.post();
}
protected:
ServiceRequestContext &requestCtx_;
UpdateType type_;
ChunkId chunkId_;
std::shared_ptr<StorageTarget> target_;
// Only the descriptor passed to the constructor is filled; the other one stays default-constructed.
UpdateIO updateIO_;
CommitIO commitIO_;
ChunkEngineUpdateJob &chunkEngineJob_;
UpdateOptions options_;
IOResult result_;
folly::coro::Baton baton_;
struct State {
const uint8_t *data = nullptr;
} state_;
bool allowToAllocate_ = true;
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,46 @@
#include "storage/update/UpdateWorker.h"
namespace hf3fs::storage {
// Create one bounded job queue per disk and start the update threads.
//
// Thread i serves queue (i % numberOfDisks), so every disk needs at least one thread.
// Returns kInvalidConfig when there are no disks or when fewer threads than disks are configured.
Result<Void> UpdateWorker::start(uint32_t numberOfDisks) {
  // With zero disks there would be no queue, and each worker thread would evaluate
  // `i % queueVec_.size()` with a zero divisor.
  if (numberOfDisks == 0) {
    return makeError(StatusCode::kInvalidConfig, fmt::format("update worker needs at least one disk"));
  }
  if (config_.num_threads() < numberOfDisks) {
    return makeError(StatusCode::kInvalidConfig,
                     fmt::format("too few update worker threads, {} < {}", config_.num_threads(), numberOfDisks));
  }

  queueVec_.reserve(numberOfDisks);
  for (auto i = 0u; i < numberOfDisks; ++i) {
    queueVec_.emplace_back(std::make_unique<Queue>(config_.queue_size()));
  }

  // Threads are spread round-robin over the per-disk queues.
  for (auto i = 0u; i < config_.num_threads(); ++i) {
    executors_.add([this, i] { run(*queueVec_[i % queueVec_.size()]); });
  }
  return Void{};
}
// Stop all update threads and wait for both executors. Only the first call does any work.
void UpdateWorker::stopAndJoin() {
  const bool alreadyStopped = stopped_.test_and_set();
  if (alreadyStopped) {
    return;
  }

  // Push one nullptr sentinel per worker thread, using the same thread-to-queue mapping as start().
  // Nothing is pushed when no queue was ever created.
  if (!queueVec_.empty()) {
    const auto queueCount = queueVec_.size();
    for (auto threadIdx = 0u; threadIdx < config_.num_threads(); ++threadIdx) {
      auto &queue = *queueVec_[threadIdx % queueCount];
      queue.enqueue(nullptr);
    }
  }

  executors_.join();
  bgExecutors_.join();
}
// Thread body: drain jobs from `queue` and apply each to its target until a nullptr job arrives.
void UpdateWorker::run(Queue &queue) {
  for (;;) {
    auto job = queue.dequeue();
    if (UNLIKELY(job == nullptr)) {
      // nullptr is the stop sentinel.
      XLOGF(DBG, "Storage worker {} stop...", fmt::ptr(this));
      return;
    }
    auto &target = job->target();
    target->updateChunk(*job, bgExecutors_);
  }
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,48 @@
#pragma once
#include <folly/executors/CPUThreadPoolExecutor.h>
#include "common/utils/BoundedQueue.h"
#include "storage/store/StorageTargets.h"
#include "storage/update/UpdateJob.h"
namespace hf3fs::storage {
// Runs update jobs on a thread pool, with one bounded job queue per disk.
class UpdateWorker {
public:
class Config : public ConfigBase<Config> {
CONFIG_ITEM(queue_size, 4096u);
CONFIG_ITEM(num_threads, 32ul);
CONFIG_ITEM(bg_num_threads, 8ul);
};
// Sizes both executors from the config: "Update" threads run jobs, and "Recycle" threads form the
// executor that is passed to updateChunk() in run().
UpdateWorker(const Config &config)
: config_(config),
executors_(std::make_pair(config_.num_threads(), config_.num_threads()),
std::make_shared<folly::NamedThreadFactory>("Update")),
bgExecutors_(std::make_pair(config_.bg_num_threads(), config_.bg_num_threads()),
std::make_shared<folly::NamedThreadFactory>("Recycle")) {}
~UpdateWorker() { stopAndJoin(); }
Result<Void> start(uint32_t numberOfDisks);
void stopAndJoin();
// Enqueue a job on the queue of the disk its target lives on. The disk index is only checked by
// assert, so it must be smaller than the number of disks given to start().
CoTask<void> enqueue(UpdateJob *job) {
assert(job->target()->diskIndex() < queueVec_.size());
co_await queueVec_[job->target()->diskIndex()]->co_enqueue(job);
}
protected:
using Queue = BoundedQueue<UpdateJob *>;
// Thread body: processes jobs from `queue` until a nullptr job is dequeued.
void run(Queue &queue);
private:
ConstructLog<"storage::UpdateWorker"> constructLog_;
const Config &config_;
// One queue per disk, filled by start().
std::vector<std::unique_ptr<Queue>> queueVec_;
folly::CPUThreadPoolExecutor executors_;
folly::CPUThreadPoolExecutor bgExecutors_;
// Set by the first stopAndJoin() call so later calls return immediately.
std::atomic_flag stopped_;
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,59 @@
#include "storage/worker/AllocateWorker.h"
#include <memory>
#include "common/monitor/Recorder.h"
#include "common/utils/Duration.h"
#include "common/utils/Result.h"
#include "common/utils/UtcTime.h"
#include "fbs/mgmtd/MgmtdTypes.h"
#include "storage/service/Components.h"
namespace hf3fs::storage {
// Stores references to the config and components and creates the executor, sized with the pair
// (1, 1), whose threads are named "Allocate".
AllocateWorker::AllocateWorker(const Config &config, Components &components)
: config_(config),
components_(components),
executors_(std::make_pair(1u, 1u), std::make_shared<folly::NamedThreadFactory>("Allocate")) {}
// Schedule loop() on the executor and mark the worker as started.
Result<Void> AllocateWorker::start() {
  executors_.add([self = this] { self->loop(); });
  started_.store(true);
  return Void{};
}
// Ask loop() to stop, poll until it reports that it has stopped, then join the executor.
Result<Void> AllocateWorker::stopAndJoin() {
  stopping_.store(true);
  cond_.notify_one();

  // Only wait when the loop was actually started; log on every fifth poll.
  int polls = 0;
  while (started_ && !stopped_) {
    XLOGF_IF(INFO, polls % 5 == 0, "Waiting for AllocateWorker@{}::loop stop...", fmt::ptr(this));
    std::this_thread::sleep_for(100_ms);
    ++polls;
  }

  executors_.join();
  return Void{};
}
// Background loop: roughly every 100ms, run group allocation and compaction on every engine using
// the current config values, until stopping_ is set.
void AllocateWorker::loop() {
  for (;;) {
    if (stopping_) {
      break;
    }

    std::unique_lock<std::mutex> guard(mutex_);
    // wait_for returns the predicate value: true means a stop was requested during the wait.
    const bool stopRequested = cond_.wait_for(guard, 100_ms, [this] { return stopping_.load(); });
    if (stopRequested) {
      break;
    }

    // Read the thresholds once per round so every engine sees the same values.
    const auto groupsLow = config_.min_remain_groups();
    const auto groupsHigh = config_.max_remain_groups();
    const auto ultraGroupsLow = config_.min_remain_ultra_groups();
    const auto ultraGroupsHigh = config_.max_remain_ultra_groups();
    const auto reservedLimit = config_.max_reserved_chunks();

    for (auto &engine : components_.storageTargets.engines()) {
      engine->allocate_groups(groupsLow, groupsHigh, 128);
      engine->allocate_ultra_groups(ultraGroupsLow, ultraGroupsHigh, 32);
      engine->compact_groups(reservedLimit);
    }
  }

  XLOGF(INFO, "AllocateWorker@{}::loop stopped", fmt::ptr(this));
  stopped_ = true;
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,46 @@
#pragma once
#include <atomic>
#include <condition_variable>
#include <folly/executors/CPUThreadPoolExecutor.h>
#include <mutex>
#include "storage/service/TargetMap.h"
namespace hf3fs::storage {
struct Components;
// Background worker that periodically calls the group allocation and compaction functions of every
// chunk engine (see loop()).
class AllocateWorker {
public:
// Thresholds that loop() passes to allocate_groups / allocate_ultra_groups / compact_groups.
class Config : public ConfigBase<Config> {
CONFIG_HOT_UPDATED_ITEM(min_remain_groups, 4ul);
CONFIG_HOT_UPDATED_ITEM(max_remain_groups, 8ul);
CONFIG_HOT_UPDATED_ITEM(min_remain_ultra_groups, 0ul); // greater than 4MiB
CONFIG_HOT_UPDATED_ITEM(max_remain_ultra_groups, 4ul);
CONFIG_HOT_UPDATED_ITEM(max_reserved_chunks, 1_GB);
};
AllocateWorker(const Config &config, Components &components);
// Schedule loop() on the executor.
Result<Void> start();
// Request the loop to stop, wait until it has stopped, and join the executor.
Result<Void> stopAndJoin();
protected:
void loop();
private:
ConstructLog<"storage::AllocateWorker"> constructLog_;
const Config &config_;
Components &components_;
folly::CPUThreadPoolExecutor executors_;
// mutex_/cond_ back the timed wait in loop(); stopAndJoin() notifies cond_.
std::mutex mutex_;
std::condition_variable cond_;
// stopping_: stop requested; started_: set by start(); stopped_: set by loop() when it exits.
std::atomic<bool> stopping_ = false;
std::atomic<bool> started_ = false;
std::atomic<bool> stopped_ = false;
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,284 @@
#include "storage/worker/CheckWorker.h"
#include "common/monitor/Recorder.h"
#include "common/utils/Duration.h"
#include "common/utils/UtcTime.h"
#include "fbs/mgmtd/MgmtdTypes.h"
#include "storage/service/Components.h"
namespace hf3fs::storage {
namespace {
// Per-disk counts of targets that use (new) or do not use (old) the chunk engine. CheckWorker::loop
// sets both with an instance tag derived from the disk index.
monitor::ValueRecorder new_chunk_engine_count = monitor::ValueRecorder{"storage.chunk_engine.new", std::nullopt, false};
monitor::ValueRecorder old_chunk_engine_count = monitor::ValueRecorder{"storage.chunk_engine.old", std::nullopt, false};
// Bundle of monitor recorders that share one tag set. CheckWorker::loop creates one instance per
// target path and feeds it with disk space information and chunk engine metrics.
// The constructor initializes the members in declaration order.
struct Recorders {
// Disk space and health.
monitor::ValueRecorder disk_capacity;
monitor::ValueRecorder disk_readonly;
monitor::ValueRecorder disk_available;
monitor::ValueRecorder disk_free;
monitor::OperationRecorder check_disk;
// Values taken from the engine's raw_used_size().
monitor::ValueRecorder position_count;
monitor::ValueRecorder position_rc;
// Counters and latencies taken from the engine's get_metrics().
monitor::CountRecorder copy_on_write_times;
monitor::LatencyRecorder copy_on_write_latency;
monitor::CountRecorder copy_on_write_read_times;
monitor::CountRecorder copy_on_write_read_bytes;
monitor::LatencyRecorder copy_on_write_read_latency;
monitor::CountRecorder checksum_reuse;
monitor::CountRecorder checksum_combine;
monitor::CountRecorder checksum_recalculate;
monitor::CountRecorder safe_write_direct_append;
monitor::CountRecorder safe_write_indirect_append;
monitor::CountRecorder safe_write_truncate_shorten;
monitor::CountRecorder safe_write_truncate_extend;
monitor::CountRecorder safe_write_read_tail_times;
monitor::CountRecorder safe_write_read_tail_bytes;
monitor::CountRecorder allocate_times;
monitor::LatencyRecorder allocate_latency;
monitor::CountRecorder pwrite_times;
monitor::LatencyRecorder pwrite_latency;
// Registers every recorder under its metric name with the given tag. The ValueRecorders are
// constructed with `false` as their third argument.
Recorders(const monitor::TagSet &tag)
: disk_capacity("storage.disk_info.capacity", tag, false),
disk_readonly("storage.disk_info.read_only", tag, false),
disk_available("storage.disk_info.available", tag, false),
disk_free("storage.disk_info.free", tag, false),
check_disk("storage.check_disk", tag),
position_count("storage.chunk_engine.position_count", tag, false),
position_rc("storage.chunk_engine.position_rc", tag, false),
copy_on_write_times("storage.chunk_engine.copy_on_write_times", tag),
copy_on_write_latency("storage.chunk_engine.copy_on_write_latency", tag),
copy_on_write_read_times("storage.chunk_engine.copy_on_write_read_times", tag),
copy_on_write_read_bytes("storage.chunk_engine.copy_on_write_read_bytes", tag),
copy_on_write_read_latency("storage.chunk_engine.copy_on_write_read_latency", tag),
checksum_reuse("storage.chunk_engine.checksum_reuse", tag),
checksum_combine("storage.chunk_engine.checksum_combine", tag),
checksum_recalculate("storage.chunk_engine.checksum_recalculate", tag),
safe_write_direct_append("storage.chunk_engine.safe_write_direct_append", tag),
safe_write_indirect_append("storage.chunk_engine.safe_write_indirect_append", tag),
safe_write_truncate_shorten("storage.chunk_engine.safe_write_truncate_shorten", tag),
safe_write_truncate_extend("storage.chunk_engine.safe_write_truncate_extend", tag),
safe_write_read_tail_times("storage.chunk_engine.safe_write_read_tail_times", tag),
safe_write_read_tail_bytes("storage.chunk_engine.safe_write_read_tail_bytes", tag),
allocate_times("storage.chunk_engine.allocate_times", tag),
allocate_latency("storage.chunk_engine.allocate_latency", tag),
pwrite_times("storage.chunk_engine.pwrite_times", tag),
pwrite_latency("storage.chunk_engine.pwrite_latency", tag) {}
};
} // namespace
// Stores references to the config and components and creates the executor, sized with the pair
// (1, 1), whose threads are named "Check".
CheckWorker::CheckWorker(const Config &config, Components &components)
: config_(config),
components_(components),
executors_(std::make_pair(1u, 1u), std::make_shared<folly::NamedThreadFactory>("Check")) {}
// Launch the check loop on the executor. The task keeps its own copies of both vectors, so the
// caller's arguments do not have to outlive this call.
Result<Void> CheckWorker::start(const std::vector<Path> &targetPaths, const std::vector<std::string> &manufacturers) {
  executors_.add([this, paths = targetPaths, makers = manufacturers] { loop(paths, makers); });
  started_.store(true);
  return Void{};
}
// Ask loop() to stop, poll until it reports that it has stopped, then join the executor.
Result<Void> CheckWorker::stopAndJoin() {
  stopping_.store(true);
  cond_.notify_one();

  // Only wait when the loop was actually started; log on every fifth poll.
  int polls = 0;
  while (started_ && !stopped_) {
    XLOGF_IF(INFO, polls % 5 == 0, "Waiting for CheckWorker@{}::loop stop...", fmt::ptr(this));
    std::this_thread::sleep_for(100_ms);
    ++polls;
  }

  executors_.join();
  return Void{};
}
// Background loop of the check worker. Wakes up about every 100ms until stopping_ is set and runs
// these steps, each on its own interval:
//   1. try to reload targets that are OFFLINE and no longer referenced,
//   2. every 3s: check space and writability of each target path and update disk state,
//   3. every 60s: clean up expired clients,
//   4. every update_target_size_interval: refresh used sizes and per-disk gauges,
//   5. every 1s: trigger a heartbeat if needed,
//   6. every 1s: report chunk engine metrics.
// `manufacturers` is currently unused. Steps 2, 4 and 6 index engines() by the position of the
// path in `targetPaths`.
void CheckWorker::loop(const std::vector<Path> &targetPaths, const std::vector<std::string> &manufacturers) {
  (void)manufacturers;

  // 0. initialize records.
  // Function-local static: built once, from the targetPaths of the first call.
  static auto recorders = [&] {
    std::vector<std::unique_ptr<Recorders>> recorders;
    for (auto i = 0ul; i < targetPaths.size(); ++i) {
      monitor::TagSet tag;
      tag.addTag("instance", std::to_string(i));
      recorders.push_back(std::make_unique<Recorders>(tag));
    }
    return recorders;
  }();

  RelativeTime lastCheckDiskStatusTime{};
  RelativeTime lastCleanUpExpiredClientsTime{};
  RelativeTime lastTriggerHeartbeatTime{};
  RelativeTime lastUpdateTargetUsedSizeTime = RelativeTime::now();
  RelativeTime lastChunkEngineMetricsReportTime = RelativeTime::now();
  // Disk index -> used fraction, written in step 2 and read in step 4.
  robin_hood::unordered_map<uint32_t, double> diskUsage;

  while (!stopping_) {
    auto lock = std::unique_lock(mutex_);
    if (cond_.wait_for(lock, 100_ms, [&] { return stopping_.load(); })) {
      break;
    }

    // 1. reload offline targets.
    {
      auto snapshot = components_.targetMap.snapshot();
      for (auto &[targetId, target] : snapshot->getTargets()) {
        if (target.unrecoverableOffline()) {
          continue;
        }
        if (target.localState == flat::LocalTargetState::OFFLINE) {
          if (target.weakStorageTarget.expired()) {
            auto result = components_.storageTargets.loadTarget(target.path);
            if (UNLIKELY(!result)) {
              XLOGF(ERR, "CheckWorker@{} reload target {} failed", fmt::ptr(this), target.path);
            } else {
              XLOGF(INFO, "CheckWorker@{} reload target {} succ", fmt::ptr(this), target.path);
              components_.refreshRoutingInfo();
            }
          } else {
            XLOGF(WARNING, "CheckWorker@{} offline target {} is still being used", fmt::ptr(this), target.path);
          }
        }
      }
    }

    // 2. check disk status.
    auto now = RelativeTime::now();
    auto diskLowSpaceThreshold = config_.disk_low_space_threshold();
    auto diskRejectCreateChunkThreshold = config_.disk_reject_create_chunk_threshold();
    if (now - lastCheckDiskStatusTime >= 3_s) {
      lastCheckDiskStatusTime = now;
      XLOGF(DBG9, "check disk status start");
      for (auto i = 0ul; i < targetPaths.size(); ++i) {
        auto &targetPath = targetPaths[i];
        auto &recorder = *recorders[i];
        boost::system::error_code ec{};
        auto spaceInfo = boost::filesystem::space(targetPath, ec);
        if (UNLIKELY(ec.failed())) {
          XLOGF(CRITICAL, "check disk failed {}, errno: {}", targetPath, ec.message());
          components_.targetMap.offlineTargets(targetPath);
          continue;
        }
        recorder.disk_capacity.set(spaceInfo.capacity);
        recorder.disk_free.set(spaceInfo.available);
        diskUsage[i] = 1.0 - (double)spaceInfo.available / std::max(1ul, spaceInfo.capacity);

        auto recordGuard = recorder.check_disk.record();
        bool writable = checkWritable(targetPath);
        if (!writable) {
          recorder.disk_readonly.set(1);
          XLOGF(CRITICAL, "check disk failed {}, readonly", targetPath);
          components_.targetMap.offlineTargets(targetPath);
          continue;
        }
        // The read_only gauge keeps its last value between checks, so clear it explicitly once the
        // disk is writable again; otherwise a single failed check is reported forever.
        recorder.disk_readonly.set(0);
        recordGuard.report(true);

        bool lowSpace = diskUsage[i] >= diskLowSpaceThreshold;
        bool rejectCreateChunk = diskUsage[i] >= diskRejectCreateChunkThreshold;
        components_.storageTargets.engines()[i]->set_allow_to_allocate(!rejectCreateChunk);
        components_.targetMap.updateDiskState(targetPath, lowSpace, rejectCreateChunk);
      }
      XLOGF(DBG9, "check disk status finished");
    }

    // 3. clean up expired clients.
    now = RelativeTime::now();
    if (now - lastCleanUpExpiredClientsTime >= 60_s) {
      lastCleanUpExpiredClientsTime = now;
      auto result = components_.getActiveClientsList();
      if (result) {
        components_.reliableUpdate.cleanUpExpiredClients(*result);
      } else if (result.error().code() != StorageClientCode::kRoutingError) {
        XLOGF(ERR, "get active clients list error: {}", result.error());
      }
    }

    // 4. update target used size.
    now = RelativeTime::now();
    auto emergencyRecyclingRatio = config_.emergency_recycling_ratio();
    if (now - lastUpdateTargetUsedSizeTime >= config_.update_target_size_interval()) {
      lastUpdateTargetUsedSizeTime = now;
      components_.targetMap.updateTargetUsedSize();

      robin_hood::unordered_map<uint32_t, uint64_t> diskUnusedSize;
      // Disk index -> (targets using the chunk engine, targets not using it).
      robin_hood::unordered_map<uint32_t, std::pair<uint32_t, uint32_t>> chunkEngineCount;
      auto snapshot = components_.targetMap.snapshot();
      for (auto &[targetId, target] : snapshot->getTargets()) {
        if (!target.unrecoverableOffline() && target.localState != flat::LocalTargetState::OFFLINE &&
            target.storageTarget != nullptr) {
          target.storageTarget->reportUnrecycledSize();
          target.storageTarget->setEmergencyRecycling(diskUsage[target.diskIndex] >= emergencyRecyclingRatio);
          diskUnusedSize[target.diskIndex] += target.storageTarget->unusedSize();
          if (target.storageTarget->useChunkEngine()) {
            chunkEngineCount[target.diskIndex].first++;
          } else {
            chunkEngineCount[target.diskIndex].second++;
          }
        }
      }

      for (auto i = 0ul; i < targetPaths.size(); ++i) {
        auto tag = monitor::instanceTagSet(std::to_string(i));
        auto [new_count, old_count] = chunkEngineCount[i];
        new_chunk_engine_count.set(new_count, tag);
        old_chunk_engine_count.set(old_count, tag);

        auto rawUsedSize = components_.storageTargets.engines()[i]->raw_used_size();
        auto &recorder = *recorders[i];
        recorder.disk_available.set(diskUnusedSize[i] + rawUsedSize.reserved_size + recorder.disk_free.value());
        recorder.position_count.set(rawUsedSize.position_count);
        recorder.position_rc.set(rawUsedSize.position_rc);
      }
    }

    // 5. trigger heartbeat if need.
    if (now - lastTriggerHeartbeatTime >= 1_s) {
      lastTriggerHeartbeatTime = now;
      components_.triggerHeartbeatIfNeed();
    }

    // 6. report chunk engine metrics.
    now = RelativeTime::now();
    if (now - lastChunkEngineMetricsReportTime >= 1_s) {
      lastChunkEngineMetricsReportTime = now;
      for (auto i = 0ul; i < targetPaths.size(); ++i) {
        auto &recorder = *recorders[i];
        auto metrics = components_.storageTargets.engines()[i]->get_metrics();
        // Latency samples are only added when the reported value is non-zero.
        recorder.copy_on_write_times.addSample(metrics.copy_on_write_times);
        if (metrics.copy_on_write_latency) {
          recorder.copy_on_write_latency.addSample(std::chrono::microseconds(metrics.copy_on_write_latency));
        }
        recorder.copy_on_write_read_times.addSample(metrics.copy_on_write_read_times);
        recorder.copy_on_write_read_bytes.addSample(metrics.copy_on_write_read_bytes);
        if (metrics.copy_on_write_read_latency) {
          recorder.copy_on_write_read_latency.addSample(std::chrono::microseconds(metrics.copy_on_write_read_latency));
        }
        recorder.checksum_reuse.addSample(metrics.checksum_reuse);
        recorder.checksum_combine.addSample(metrics.checksum_combine);
        recorder.checksum_recalculate.addSample(metrics.checksum_recalculate);
        recorder.safe_write_direct_append.addSample(metrics.safe_write_direct_append);
        recorder.safe_write_indirect_append.addSample(metrics.safe_write_indirect_append);
        recorder.safe_write_truncate_shorten.addSample(metrics.safe_write_truncate_shorten);
        recorder.safe_write_truncate_extend.addSample(metrics.safe_write_truncate_extend);
        recorder.safe_write_read_tail_times.addSample(metrics.safe_write_read_tail_times);
        recorder.safe_write_read_tail_bytes.addSample(metrics.safe_write_read_tail_bytes);
        recorder.allocate_times.addSample(metrics.allocate_times);
        if (metrics.allocate_latency) {
          recorder.allocate_latency.addSample(std::chrono::microseconds(metrics.allocate_latency));
        }
        recorder.pwrite_times.addSample(metrics.pwrite_times);
        if (metrics.pwrite_latency) {
          recorder.pwrite_latency.addSample(std::chrono::microseconds(metrics.pwrite_latency));
        }
      }
    }
  }
  stopped_ = true;
  XLOGF(INFO, "CheckWorker@{}::loop stopped", fmt::ptr(this));
}
// Probe `path` by opening ".hf3fs_check" inside it for writing and streaming the current UTC time
// into it. Returns true only if the file could be opened and the stream is still good afterwards.
bool CheckWorker::checkWritable(const Path &path) {
  std::ofstream probe(path / ".hf3fs_check", std::ios::out);
  if (!probe) {
    return false;
  }
  probe << fmt::format("{}", UtcTime{UtcClock::now()});
  return static_cast<bool>(probe);
}
} // namespace hf3fs::storage

View File

@@ -0,0 +1,51 @@
#pragma once
#include <atomic>
#include <condition_variable>
#include <folly/executors/CPUThreadPoolExecutor.h>
#include <mutex>
#include "common/utils/ConfigBase.h"
#include "storage/service/TargetMap.h"
namespace hf3fs::storage {
struct Components;
// Background worker that periodically checks disks and targets and reports related metrics
// (see loop()).
class CheckWorker {
public:
class Config : public ConfigBase<Config> {
// Interval of the "update target used size" step in loop().
CONFIG_HOT_UPDATED_ITEM(update_target_size_interval, 10_s);
// Disk usage fractions compared against in loop(): emergency recycling is enabled at or above
// the first, the low-space flag at or above the second, and chunk creation is rejected at or
// above the third.
CONFIG_HOT_UPDATED_ITEM(emergency_recycling_ratio, 0.95);
CONFIG_HOT_UPDATED_ITEM(disk_low_space_threshold, 0.96);
CONFIG_HOT_UPDATED_ITEM(disk_reject_create_chunk_threshold, 0.98);
};
CheckWorker(const Config &config, Components &components);
// start check worker.
Result<Void> start(const std::vector<Path> &targetPaths, const std::vector<std::string> &manufacturers);
// stop check worker. End all tasks immediately.
Result<Void> stopAndJoin();
// check hf3fs path writable or not.
static bool checkWritable(const Path &path);
protected:
void loop(const std::vector<Path> &targetPaths, const std::vector<std::string> &manufacturers);
private:
ConstructLog<"storage::CheckWorker"> constructLog_;
const Config &config_;
Components &components_;
folly::CPUThreadPoolExecutor executors_;
// mutex_/cond_ back the timed wait in loop(); stopAndJoin() notifies cond_.
std::mutex mutex_;
std::condition_variable cond_;
// stopping_: stop requested; started_: set by start(); stopped_: set by loop() when it exits.
std::atomic<bool> stopping_ = false;
std::atomic<bool> started_ = false;
std::atomic<bool> stopped_ = false;
};
} // namespace hf3fs::storage

View File

@@ -0,0 +1,178 @@
#include "storage/worker/DumpWorker.h"
#include <gperftools/profiler.h>
#include <memory>
#include <sys/times.h>
#include "common/monitor/Recorder.h"
#include "common/utils/Duration.h"
#include "common/utils/Result.h"
#include "common/utils/UtcTime.h"
#include "fbs/mgmtd/MgmtdTypes.h"
#include "storage/service/Components.h"
namespace hf3fs::storage {
namespace {
// Gauge set by DumpWorker::loop to (user + system CPU ticks consumed) / (wall-clock ticks elapsed)
// between two consecutive times() samples.
monitor::ValueRecorder cpuCores{"storage.sys.cpu_cores", std::nullopt, true};
}
// Stores references to the config and components and creates the executor, sized with the pair
// (1, 1), whose threads are named "Dump".
DumpWorker::DumpWorker(const Config &config, Components &components)
: config_(config),
components_(components),
executors_(std::make_pair(1u, 1u), std::make_shared<folly::NamedThreadFactory>("Dump")) {}
// start dump worker.
// `id` is stored in nodeId_, which profilerStart() uses to name profile files.
Result<Void> DumpWorker::start(flat::NodeId id) {
  // Publish the node id before the loop thread exists: loop() may call profilerStart(), which
  // reads nodeId_, so assigning it after executors_.add() would race with that read.
  nodeId_ = id;
  executors_.add([this] { loop(); });
  started_ = true;
  return Void{};
}
// Ask loop() to stop, poll until it reports that it has stopped, then join the executor.
Result<Void> DumpWorker::stopAndJoin() {
  stopping_.store(true);
  cond_.notify_one();

  // Only wait when the loop was actually started; log on every fifth poll.
  int polls = 0;
  while (started_ && !stopped_) {
    XLOGF_IF(INFO, polls % 5 == 0, "Waiting for DumpWorker@{}::loop stop...", fmt::ptr(this));
    std::this_thread::sleep_for(100_ms);
    ++polls;
  }

  executors_.join();
  return Void{};
}
// Background loop of the dump worker. Wakes up about once per second until stopping_ is set and:
//   - samples process CPU time with times() and publishes consumed/elapsed ticks as cpuCores,
//   - starts the CPU profiler when that value reaches high_cpu_usage_threshold and stops it once
//     at least one minute has passed,
//   - dumps all targets' metadata every dump_interval when dump_root_path is set.
void DumpWorker::loop() {
  RelativeTime lastDumpTime = RelativeTime::now();
  struct tms last_tms {};
  struct tms cur_tms {};
  auto last_tck = times(&last_tms);
  bool profiler = false;
  RelativeTime lastProfilerTime = RelativeTime::now();

  while (!stopping_) {
    auto lock = std::unique_lock(mutex_);
    if (cond_.wait_for(lock, 1000_ms, [&] { return stopping_.load(); })) {
      break;
    }

    auto cur_tck = times(&cur_tms);
    // Only use the sample when time moved forward and neither CPU counter went backwards.
    if (last_tck < cur_tck && last_tms.tms_stime <= cur_tms.tms_stime && last_tms.tms_utime <= cur_tms.tms_utime) {
      auto elapsed = cur_tck - last_tck;
      auto usage = (cur_tms.tms_stime - last_tms.tms_stime) + (cur_tms.tms_utime - last_tms.tms_utime);
      auto cores = usage / elapsed;
      cpuCores.set(cores);
      if (!profiler && cores >= config_.high_cpu_usage_threshold()) {
        // The flag is set even if starting fails, which limits retries to one per minute.
        profiler = true;
        lastProfilerTime = RelativeTime::now();
        profilerStart(config_.dump_root_path());
      }
    }
    last_tck = cur_tck;
    last_tms = cur_tms;

    if (profiler && RelativeTime::now() - lastProfilerTime >= 1_min) {
      ProfilerStop();
      profiler = false;
    }

    // 1. dump all targets.
    auto now = RelativeTime::now();
    if (now - lastDumpTime >= config_.dump_interval()) {
      auto rootPath = config_.dump_root_path();
      if (rootPath.empty()) {
        continue;
      }
      dump(rootPath);
      lastDumpTime = now;
      // Re-sample after the dump so the next CPU sample starts from here.
      last_tck = times(&last_tms);
    }
  }

  // Do not leave a profiling session running once the loop that owns it is gone.
  if (profiler) {
    ProfilerStop();
  }
  stopped_ = true;
  XLOGF(INFO, "DumpWorker@{}::loop stopped", fmt::ptr(this));
}
// Dump the chunk metadata of every non-offline local target into `rootPath`/<YYYY-MM-DD>/.
//
// One file per target, named "<chainId>.<chainVer>.<targetId>.<hostname>", containing the
// serialized ChunkId -> ChunkMetadata map. Stops early without error when stopping_ is set.
// Returns the first error encountered (directory creation, metadata read, file open or write);
// targets after the failing one are not dumped.
Result<Void> DumpWorker::dump(const Path &rootPath) {
  auto hostname = SysResource::hostname().value_or("unknown");
  auto dumpPath = rootPath / fmt::format("{:%F}", fmt::localtime(std::time(nullptr)));
  boost::system::error_code ec{};
  boost::filesystem::create_directories(dumpPath, ec);
  if (UNLIKELY(ec.failed())) {
    auto msg = fmt::format("dump meta create directory {} failed: {}", dumpPath, ec.message());
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }

  // Collect weak references first so the target map snapshot is not held during file I/O.
  std::map<std::string, std::weak_ptr<StorageTarget>> targets;
  {
    auto targetMap = components_.targetMap.snapshot();
    for (auto &[targetId, target] : targetMap->getTargets()) {
      if (target.localState != flat::LocalTargetState::OFFLINE && target.storageTarget != nullptr) {
        auto dumpFileName = fmt::format("{}.{}.{}.{}",
                                        target.vChainId.chainId.toUnderType(),
                                        target.vChainId.chainVer.toUnderType(),
                                        targetId.toUnderType(),
                                        hostname);
        targets[dumpFileName] = target.storageTarget;
      }
    }
  }

  for (auto &[name, weakTarget] : targets) {
    if (stopping_) {
      break;
    }
    auto target = weakTarget.lock();
    if (!target) {
      // Target went away since the snapshot was taken.
      continue;
    }

    auto dumpFilePath = dumpPath / name;
    std::unordered_map<ChunkId, ChunkMetadata> metas;
    auto dumpResult = target->getAllMetadataMap(metas);
    if (UNLIKELY(!dumpResult)) {
      XLOGF(ERR, "dump meta {} failed: {}", dumpFilePath, dumpResult.error());
      return makeError(std::move(dumpResult.error()));
    }
    // Drop the strong reference before writing the file.
    target = nullptr;

    std::ofstream dumpFile{dumpFilePath};
    if (UNLIKELY(!dumpFile)) {
      auto msg = fmt::format("dump meta create file failed: {}", dumpFilePath);
      XLOG(ERR, msg);
      return makeError(StorageCode::kStorageInitFailed, std::move(msg));
    }
    auto bytes = serde::serializeBytes(metas);
    auto view = std::string_view{bytes};
    dumpFile.write(view.data(), view.size());
    // write() may only fill the stream buffer; flush so that a failing write to the file shows up
    // in the stream state checked below instead of being lost in the destructor.
    dumpFile.flush();
    if (UNLIKELY(!dumpFile)) {
      auto msg = fmt::format("dump meta write file failed: {}", dumpFilePath);
      XLOG(ERR, msg);
      return makeError(StorageCode::kStorageInitFailed, std::move(msg));
    }
  }
  return Void{};
}
// Start the gperftools CPU profiler, writing to `rootPath`/<YYYY-MM-DD>/<nodeId>.<HH:MM:SS>.perf.
//
// Returns kStorageInitFailed when the output directory cannot be created or when the profiler
// reports that it did not start.
Result<Void> DumpWorker::profilerStart(const Path &rootPath) {
  // Sample the clock once so the directory name and the file name agree.
  const auto localNow = fmt::localtime(std::time(nullptr));
  auto dumpPath = rootPath / fmt::format("{:%F}", localNow);
  boost::system::error_code ec{};
  boost::filesystem::create_directories(dumpPath, ec);
  if (UNLIKELY(ec.failed())) {
    auto msg = fmt::format("profiler create directory {} failed: {}", dumpPath, ec.message());
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }

  auto dumpFile = dumpPath / fmt::format("{}.{:%T}.perf", nodeId_.toUnderType(), localNow);
  // ProfilerStart returns zero when profiling could not be started.
  if (UNLIKELY(ProfilerStart(dumpFile.c_str()) == 0)) {
    auto msg = fmt::format("profiler start failed: {}", dumpFile);
    XLOG(ERR, msg);
    return makeError(StorageCode::kStorageInitFailed, std::move(msg));
  }
  return Void{};
}
} // namespace hf3fs::storage

Some files were not shown because too many files have changed in this diff Show More