Initial commit

This commit is contained in:
dev
2025-02-27 21:53:53 +08:00
commit 815e55e4c0
1291 changed files with 185445 additions and 0 deletions

2
.cargo/config.toml Normal file
View File

@@ -0,0 +1,2 @@
[build]
rustflags = ["-Clinker=clang++-14", "-Clink-arg=-fuse-ld=lld"]

29
.clang-format Normal file
View File

@@ -0,0 +1,29 @@
BasedOnStyle: Google
Language: Cpp
Standard: c++20
AlignAfterOpenBracket: Align
AlignEscapedNewlines: Left
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: false
AllowAllParametersOfDeclarationOnNextLine: false
AlwaysBreakAfterReturnType: None
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BitFieldColonSpacing: Both
BreakConstructorInitializers: BeforeColon
BreakInheritanceList: BeforeColon
IndentWidth: 2
PointerAlignment: Right
DerivePointerAlignment: false
ColumnLimit: 120
PackConstructorInitializers: Never
SpaceAfterTemplateKeyword: true
IncludeCategories:
- Regex: 'Flat_generated\.h("|>)'
Priority: 1000
SortPriority: 1000
- Regex: '"fbs/macros/'
Priority: 1001
SortPriority: 1001

117
.clang-tidy Normal file
View File

@@ -0,0 +1,117 @@
Checks: >
-*,
bugprone-undelegated-constructor,
bugprone-argument-comment,
bugprone-bad-signal-to-kill-thread,
bugprone-bool-pointer-implicit-conversion,
bugprone-copy-constructor-init,
bugprone-dangling-handle,
bugprone-forward-declaration-namespace,
bugprone-fold-init-type,
bugprone-inaccurate-erase,
bugprone-incorrect-roundings,
bugprone-infinite-loop,
bugprone-integer-division,
bugprone-macro-repeated-side-effects,
bugprone-misplaced-operator-in-strlen-in-alloc,
bugprone-misplaced-pointer-arithmetic-in-alloc,
bugprone-misplaced-widening-cast,
bugprone-move-forwarding-reference,
bugprone-multiple-statement-macro,
bugprone-parent-virtual-call,
bugprone-posix-return,
bugprone-reserved-identifier,
bugprone-signed-char-misuse,
bugprone-sizeof-container,
bugprone-sizeof-expression,
bugprone-string-constructor,
bugprone-string-integer-assignment,
bugprone-string-literal-with-embedded-nul,
bugprone-suspicious-enum-usage,
bugprone-suspicious-include,
bugprone-suspicious-memset-usage,
bugprone-suspicious-missing-comma,
bugprone-suspicious-string-compare,
bugprone-swapped-arguments,
bugprone-terminating-continue,
bugprone-throw-keyword-missing,
bugprone-too-small-loop-variable,
bugprone-undefined-memory-manipulation,
bugprone-unhandled-self-assignment,
bugprone-unused-raii,
bugprone-unused-return-value,
bugprone-use-after-move,
bugprone-virtual-near-miss,
# bugprone-macro-parentheses,
# bugprone-narrowing-conversions,
# bugprone-exception-escape,
performance-faster-string-find,
performance-for-range-copy,
performance-implicit-conversion-in-loop,
performance-inefficient-algorithm,
performance-inefficient-vector-operation,
performance-move-constructor-init,
performance-no-automatic-move,
performance-trivially-destructible,
performance-unnecessary-copy-initialization,
performance-move-const-arg,
modernize-avoid-bind,
modernize-loop-convert,
modernize-make-shared,
modernize-make-unique,
modernize-raw-string-literal,
modernize-redundant-void-arg,
modernize-replace-auto-ptr,
modernize-replace-random-shuffle,
modernize-use-auto,
modernize-use-bool-literals,
modernize-use-nullptr,
modernize-use-using,
modernize-use-override,
modernize-use-equals-default,
modernize-use-equals-delete,
misc-throw-by-value-catch-by-reference,
misc-misplaced-const,
misc-unconventional-assign-operator,
misc-redundant-expression,
misc-static-assert,
misc-unconventional-assign-operator,
misc-uniqueptr-reset-release,
misc-unused-alias-decls,
misc-unused-parameters,
misc-unused-using-decls,
readability-avoid-const-params-in-decls,
readability-const-return-type,
readability-container-size-empty,
readability-delete-null-pointer,
readability-deleted-default,
readability-misplaced-array-index,
readability-non-const-parameter,
readability-redundant-control-flow,
readability-redundant-function-ptr-dereference,
readability-redundant-smartptr-get,
readability-redundant-string-cstr,
readability-redundant-string-init,
readability-static-definition-in-anonymous-namespace,
readability-string-compare,
readability-uniqueptr-delete-release,
readability-simplify-subscript-expr,
readability-simplify-boolean-expr,
readability-inconsistent-declaration-parameter-name,
# readability-qualified-auto,
cert-flp30-c,
cert-mem57-cpp,
cert-oop58-cpp,
google-build-explicit-make-pair,
google-runtime-operator,
hicpp-exception-baseclass,
cppcoreguidelines-virtual-class-destructor,
WarningsAsErrors: ""
CheckOptions:
- key: performance-move-const-arg.CheckTriviallyCopyableMove
value: false

3
.clangd Normal file
View File

@@ -0,0 +1,3 @@
CompileFlags:
Add: -fcoroutines-ts
Remove: [-fcoroutines, -DNO_INTELLISENSE]

3
.dockerignore Normal file
View File

@@ -0,0 +1,3 @@
clang
build
deploy/*/Dockerfile

36
.github/workflows/build.yml vendored Normal file
View File

@@ -0,0 +1,36 @@
name: Build
on:
push:
branches: [ "main" ]
jobs:
build:
runs-on: self-hosted # or `ubuntu-22.04`
steps:
- uses: actions/checkout@v4
- name: Configure sccache-cache
run: |
echo "RUSTC_WRAPPER=sccache" >> $GITHUB_ENV
echo "SCCACHE_GHA_ENABLED=true" >> $GITHUB_ENV
- name: Run sccache-cache
uses: mozilla-actions/sccache-action@v0.0.4
- name: Prepare
run: |
sudo apt install -y cmake libuv1-dev liblz4-dev liblzma-dev libdouble-conversion-dev libprocps-dev libdwarf-dev libunwind-dev
sudo apt install -y libaio-dev libgflags-dev libgoogle-glog-dev libgtest-dev libgmock-dev clang-format-14 clang-14 clang-tidy-14 lld-14
sudo apt install -y libgoogle-perftools-dev google-perftools libssl-dev ccache gcc-12 g++-12 libboost-all-dev meson rustc cargo
wget https://github.com/apple/foundationdb/releases/download/7.1.61/foundationdb-clients_7.1.61-1_amd64.deb && sudo dpkg -i foundationdb-clients_7.1.61-1_amd64.deb
git clone https://github.com/libfuse/libfuse.git libfuse -b fuse-3.16.2 --depth 1 && mkdir libfuse/build && cd libfuse/build && meson setup .. && ninja && sudo ninja install && cd ../.. && rm -rf libfuse
git submodule update --init --recursive
./patches/apply.sh
- name: Build
run: |
cargo build --release
cmake -S . -B build -DCMAKE_CXX_COMPILER=clang++-14 -DCMAKE_C_COMPILER=clang-14 -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
cmake --build build -j 32

17
.gitignore vendored Normal file
View File

@@ -0,0 +1,17 @@
/.cache
/.vscode
/build
/debug
/release
/clang
/clangdbg
/packages
*.o
*.a
*.so
*.log
compile_commands.json
*_generated.h
*~
__pycache__
/target

42
.gitmodules vendored Normal file
View File

@@ -0,0 +1,42 @@
[submodule "third_party/googletest"]
path = third_party/googletest
url = https://github.com/google/googletest.git
[submodule "third_party/folly"]
path = third_party/folly
url = https://github.com/facebook/folly.git
[submodule "third_party/leveldb"]
path = third_party/leveldb
url = https://github.com/google/leveldb.git
[submodule "third_party/rocksdb"]
path = third_party/rocksdb
url = https://github.com/facebook/rocksdb.git
[submodule "third_party/scnlib"]
path = third_party/scnlib
url = https://github.com/eliaskosunen/scnlib.git
[submodule "third_party/pybind11"]
path = third_party/pybind11
url = https://github.com/pybind/pybind11.git
[submodule "third_party/clickhouse-cpp"]
path = third_party/clickhouse-cpp
url = https://github.com/ClickHouse/clickhouse-cpp.git
[submodule "third_party/fmt"]
path = third_party/fmt
url = https://github.com/fmtlib/fmt.git
[submodule "third_party/toml11"]
path = third_party/toml11
url = https://github.com/ToruNiina/toml11.git
[submodule "third_party/jemalloc"]
path = third_party/jemalloc
url = https://github.com/jemalloc/jemalloc.git
[submodule "third_party/mimalloc"]
path = third_party/mimalloc
url = https://github.com/microsoft/mimalloc.git
[submodule "third_party/zstd"]
path = third_party/zstd
url = https://github.com/facebook/zstd.git
[submodule "third_party/liburing"]
path = third_party/liburing
url = https://github.com/axboe/liburing.git
[submodule "third_party/gtest-parallel"]
path = third_party/gtest-parallel
url = https://github.com/google/gtest-parallel.git

172
CMakeLists.txt Normal file
View File

@@ -0,0 +1,172 @@
cmake_minimum_required(VERSION 3.12)
project(3FS VERSION 0.1.5 LANGUAGES C CXX)
set(CMAKE_CONFIGURATION_TYPES "RelWithDebInfo;Debug;Release;MinSizeRel" CACHE STRING "" FORCE)
if (NOT CMAKE_BUILD_TYPE OR CMAKE_BUILD_TYPE STREQUAL "None")
set (CMAKE_BUILD_TYPE "RelWithDebInfo")
message (STATUS "CMAKE_BUILD_TYPE is not set, set to default = ${CMAKE_BUILD_TYPE}")
endif ()
message (STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
string (TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
if(CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG" )
option(ENABLE_ASSERTIONS "Enable assertions" ON)
else()
option(ENABLE_ASSERTIONS "Enable assertions" OFF)
endif()
message (STATUS "ENABLE_ASSERTIONS: ${ENABLE_ASSERTIONS}")
if(ENABLE_ASSERTIONS)
add_definitions(-D_DEBUG)
# On non-Debug builds cmake automatically defines NDEBUG, so we explicitly undefine it:
if(NOT CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG")
# NOTE: use `add_compile_options` rather than `add_definitions` since
# `add_definitions` does not support generator expressions.
add_compile_options($<$<OR:$<COMPILE_LANGUAGE:C>,$<COMPILE_LANGUAGE:CXX>>:-UNDEBUG>)
endif()
endif()
option(OVERRIDE_CXX_NEW_DELETE "Override C++ new/delete operator" OFF)
option(SAVE_ALLOCATE_SIZE "Use more memory to save allocate size" OFF)
option(ENABLE_FUSE_APPLICATION "" ON)
if (DEFINED SANITIZER AND SANITIZER)
set(OVERRIDE_CXX_NEW_DELETE OFF)
endif()
message (STATUS "OVERRIDE_CXX_NEW_DELETE: ${OVERRIDE_CXX_NEW_DELETE}")
if (OVERRIDE_CXX_NEW_DELETE)
add_definitions(-DOVERRIDE_CXX_NEW_DELETE)
if (SAVE_ALLOCATE_SIZE)
add_definitions(-DSAVE_ALLOCATE_SIZE)
endif()
endif()
message (STATUS "SAVE_ALLOCATE_SIZE: ${SAVE_ALLOCATE_SIZE}")
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED OFF)
set(CMAKE_C_EXTENSIONS ON)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcoroutines-ts")
set(CMAKE_CXX_LINK_FLAGS "${CMAKE_CXX_LINK_FLAGS} -latomic")
add_link_options(-fuse-ld=lld)
# Do not build with libc++ (LLVM's implementation of the C++ standard library) in fdb
set(USE_LIBCXX OFF)
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcoroutines")
endif()
# Remove project root from the __FILE__ macro variable
add_compile_options(-fmacro-prefix-map=${CMAKE_SOURCE_DIR}=.)
add_compile_options(-msse4.2 -mavx2)
add_compile_definitions(ROCKSDB_NAMESPACE=rocksdb_internal)
include(cmake/Sanitizers.cmake)
include(cmake/CompileFlags.cmake)
# folly can't work normally under -Werror
store_compile_flags()
add_subdirectory("third_party/fmt" EXCLUDE_FROM_ALL)
restore_compile_flags()
set(ZSTD_BUILD_STATIC ON)
add_subdirectory("third_party/zstd/build/cmake" EXCLUDE_FROM_ALL)
set(zstd_INCLUDE_DIRS "${PROJECT_SOURCE_DIR}/third_party/zstd/lib")
set(ZSTD_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/third_party/zstd/lib")
set(zstd_FOUND ON)
set(ZSTD_FOUND ON)
set(zstd_LIBRARIES "${PROJECT_BINARY_DIR}/third_party/zstd/build/cmake/lib/libzstd.a")
set(ZSTD_LIBRARY "${PROJECT_BINARY_DIR}/third_party/zstd/build/cmake/lib/libzstd.a")
restore_compile_flags()
add_subdirectory("third_party/googletest" EXCLUDE_FROM_ALL)
restore_compile_flags()
set(FOLLY_NO_EXCEPTION_TRACER ON)
add_subdirectory("third_party/folly" EXCLUDE_FROM_ALL)
restore_compile_flags()
set(LEVELDB_BUILD_TESTS OFF CACHE BOOL "Disable LevelDB tests")
set(LEVELDB_BUILD_BENCHMARKS OFF CACHE BOOL "Disable LevelDB benchmarks")
set(LEVELDB_INSTALL OFF CACHE BOOL "Disable LevelDB install")
add_subdirectory("third_party/leveldb" EXCLUDE_FROM_ALL)
restore_compile_flags()
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
set(WITH_LZ4 ON)
set(WITH_ZSTD ON)
set(USE_RTTI ON)
set(WITH_TESTS OFF)
set(WITH_BENCHMARK_TOOLS OFF)
set(WITH_TOOLS OFF)
set(WITH_ALL_TESTS OFF)
add_subdirectory("third_party/rocksdb" EXCLUDE_FROM_ALL)
restore_compile_flags()
set(SCN_TESTS OFF)
set(SCN_EXAMPLES OFF)
set(SCN_BENCHMARKS OFF)
set(SCN_DOCS OFF)
set(SCN_INSTALL OFF)
set(SCN_PEDANTIC OFF)
add_subdirectory("third_party/scnlib" EXCLUDE_FROM_ALL)
restore_compile_flags()
add_subdirectory("third_party/pybind11" EXCLUDE_FROM_ALL)
restore_compile_flags()
add_subdirectory("third_party/toml11" EXCLUDE_FROM_ALL)
restore_compile_flags()
set (MI_OVERRIDE OFF)
add_subdirectory("third_party/mimalloc" EXCLUDE_FROM_ALL)
restore_compile_flags()
add_subdirectory("third_party/clickhouse-cpp" EXCLUDE_FROM_ALL)
TARGET_INCLUDE_DIRECTORIES(clickhouse-cpp-lib
PUBLIC ${PROJECT_SOURCE_DIR}/third_party/clickhouse-cpp
)
TARGET_INCLUDE_DIRECTORIES(clickhouse-cpp-lib-static
PUBLIC ${PROJECT_SOURCE_DIR}/third_party/clickhouse-cpp
)
TARGET_INCLUDE_DIRECTORIES (absl-lib
PUBLIC ${PROJECT_SOURCE_DIR}/third_party/clickhouse-cpp/contrib
)
restore_compile_flags()
add_subdirectory("third_party/liburing-cmake" EXCLUDE_FROM_ALL)
restore_compile_flags()
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror -Wpedantic")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMPILER_FLAGS}")
set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3 ${DEBUG_INFO_FLAGS} ${CMAKE_CXX_FLAGS_ADD}")
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 ${DEBUG_INFO_FLAGS} -fno-inline ${CMAKE_CXX_FLAGS_ADD}")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMPILER_FLAGS} ${CMAKE_C_FLAGS_ADD}")
set (CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -O3 ${DEBUG_INFO_FLAGS} ${CMAKE_C_FLAGS_ADD}")
set (CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0 ${DEBUG_INFO_FLAGS} -fno-inline ${CMAKE_C_FLAGS_ADD}")
set (CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${COMPILER_FLAGS} ${CMAKE_ASM_FLAGS_ADD}")
set (CMAKE_ASM_FLAGS_RELWITHDEBINFO "${CMAKE_ASM_FLAGS_RELWITHDEBINFO} -O3 ${DEBUG_INFO_FLAGS} ${CMAKE_ASM_FLAGS_ADD}")
set (CMAKE_ASM_FLAGS_DEBUG "${CMAKE_ASM_FLAGS_DEBUG} -O0 ${DEBUG_INFO_FLAGS} -fno-inline ${CMAKE_ASM_FLAGS_ADD}")
set(Boost_USE_STATIC_LIBS ON)
set(FDB_VERSION 7.1.5-ibe)
find_package(Threads REQUIRED)
find_package(Boost REQUIRED COMPONENTS filesystem system program_options)
find_library(LIBUV_LIBRARY NAMES libuv1)
enable_testing()
include(cmake/CodeCoverage.cmake)
include(cmake/CLangFormat.cmake)
include(cmake/CLangTidy.cmake)
include(cmake/Target.cmake)
include(cmake/DumpConfig.cmake)
include(cmake/Jemalloc.cmake)
include(cmake/ApacheArrow.cmake)
include(cmake/AddCrate.cmake)
configure_file(cmake/CTestCustom.cmake ${CMAKE_BINARY_DIR} @ONLY)
add_subdirectory(src)
add_subdirectory(tests)
add_subdirectory(benchmarks)

1968
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

16
Cargo.toml Normal file
View File

@@ -0,0 +1,16 @@
[workspace]
members = [
"src/client/trash_cleaner",
"src/storage/chunk_engine"
]
resolver = "2"
[workspace.package]
authors = ["dev <noreply@deepseek.com>"]
edition = "2021"
license = "MIT"
[profile.release-cmake]
debug = true
inherits = "release"
lto = true

21
LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2025 DeepSeek
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

101
README.md Normal file
View File

@@ -0,0 +1,101 @@
# Fire-Flyer File System
[![Build](https://github.com/deepseek-ai/3fs/actions/workflows/build.yml/badge.svg)](https://github.com/deepseek-ai/3fs/actions/workflows/build.yml)
[![License](https://img.shields.io/badge/LICENSE-MIT-blue.svg)](LICENSE)
The Fire-Flyer File System (3FS) is a high-performance distributed file system designed to address the challenges of AI training and inference workloads. It leverages modern SSDs and RDMA networks to provide a shared storage layer that simplifies development of distributed applications. Key features and benefits of 3FS include:
- Performance and Usability
- **Disaggregated Architecture** Combines the throughput of thousands of SSDs and the network bandwidth of hundreds of storage nodes, enabling applications to access storage resources in a locality-oblivious manner.
- **Strong Consistency** Implements Chain Replication with Apportioned Queries (CRAQ) for strong consistency, making application code simple and easy to reason about.
- **File Interfaces** Develops stateless metadata services backed by a transactional key-value store (e.g., FoundationDB). The file interface is well known and used everywhere. There is no need to learn a new storage API.
- Diverse Workloads
- **Data Preparation** Organizes outputs of data analytics pipelines into hierarchical directory structures and manages large volume of intermediate outputs efficiently.
- **Dataloaders** Eliminates the need for prefetching or shuffling datasets by enabling random access to training samples across compute nodes.
- **Checkpointing** Supports high-throughput parallel checkpointing for large-scale training.
- **KVCache for Inference** Provides a cost-effective alternative to DRAM-based caching, offering high throughput and significantly larger capacity.
## Documentation
* [Design Notes](docs/design_notes.md)
* [Setup Guide](deploy/README.md)
* [USRBIO API Reference](src/lib/api/UsrbIo.md)
* [P Specifications](./specs/README.md)
## Performance
### 1. Peak throughput
The following figure demonstrates the throughput of read stress test on a large 3FS cluster. This cluster consists of 180 storage nodes, each equipped with 2×200Gbps InfiniBand NICs and sixteen 14TiB NVMe SSDs. Approximately 500+ client nodes were used for the read stress test, with each client node configured with 1x200Gbps InfiniBand NIC. The final aggregate read throughput reached approximately 6.6 TiB/s with background traffic from training jobs.
![Large block read throughput under stress test on a 180-node cluster](docs/images/peak_throughput.jpg)
### 2. GraySort
We evaluated [smallpond](https://github.com/deepseek-ai/smallpond) using the GraySort benchmark, which measures sort performance on large-scale datasets. Our implementation adopts a two-phase approach: (1) partitioning data via shuffle using the prefix bits of keys, and (2) in-partition sorting. Both phases read/write data from/to 3FS.
The test cluster comprised 25 storage nodes (2 NUMA domains/node, 1 storage service/NUMA, 2×400Gbps NICs/node) and 50 compute nodes (2 NUMA domains, 192 physical cores, 2.2 TiB RAM, and 1×200 Gbps NIC/node). Sorting 110.5 TiB of data across 8,192 partitions completed in 30 minutes and 14 seconds, achieving an average throughput of *3.66 TiB/min*.
![](docs/images/gray_sort_server.png)
![](docs/images/gray_sort_client.png)
### 3. KVCache
KVCache is a technique used to optimize the LLM inference process. It avoids redundant computations by caching the key and value vectors of previous tokens in the decoder layers.
The top figure demonstrates the read throughput of all KVCache clients, highlighting both peak and average values, with peak throughput reaching up to 40 GiB/s. The bottom figure presents the IOPS of remove ops from garbage collection (GC) during the same time period.
![KVCache Read Throughput](./docs/images/kvcache_read_throughput.png)
![KVCache GC IOPS](./docs/images/kvcache_gc_iops.png)
## Check out source code
Clone 3FS repository from github:
git clone https://github.com/deepseek-ai/3fs
When `deepseek-ai/3fs` has been cloned to local file system, run the
following commands to check out the submodules:
```bash
cd 3fs
git submodule update --init --recursive
./patches/apply.sh
```
## Install dependencies
Install dependencies:
```bash
# for Ubuntu 20.04.
apt install cmake libuv1-dev liblz4-dev liblzma-dev libdouble-conversion-dev libprocps-dev libdwarf-dev libunwind-dev \
libaio-dev libgflags-dev libgoogle-glog-dev libgtest-dev libgmock-dev clang-format-14 clang-14 clang-tidy-14 lld-14 \
libgoogle-perftools-dev google-perftools libssl-dev ccache libclang-rt-14-dev gcc-10 g++-10 libboost1.71-all-dev
# for Ubuntu 22.04.
apt install cmake libuv1-dev liblz4-dev liblzma-dev libdouble-conversion-dev libprocps-dev libdwarf-dev libunwind-dev \
libaio-dev libgflags-dev libgoogle-glog-dev libgtest-dev libgmock-dev clang-format-14 clang-14 clang-tidy-14 lld-14 \
libgoogle-perftools-dev google-perftools libssl-dev ccache gcc-12 g++-12 libboost-all-dev
```
Install other build prerequisites:
- [`libfuse`](https://github.com/libfuse/libfuse/releases/tag/fuse-3.16.1) 3.16.1 or newer version
- [FoundationDB](https://apple.github.io/foundationdb/getting-started-linux.html) 7.1 or newer version
- [Rust](https://www.rust-lang.org/tools/install) toolchain
## Build 3FS
Build 3FS in `build` folder:
cmake -S . -B build -DCMAKE_CXX_COMPILER=clang++-14 -DCMAKE_C_COMPILER=clang-14 -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
cmake --build build -j 32
## Run a test cluster
Follow instructions in [setup guide](deploy/README.md) to run a test cluster.
## Report Issues
Please visit https://github.com/deepseek-ai/3fs/issues to report issues.

View File

@@ -0,0 +1 @@
add_subdirectory(storage_bench)

View File

@@ -0,0 +1 @@
target_add_bin(storage_bench "StorageBench.cc" test-fabric-lib storage-client storage memory-common follybenchmark gmock fdb mgmtd)

View File

@@ -0,0 +1,291 @@
#include "StorageBench.h"
#include <folly/init/Init.h>
#include "common/monitor/Monitor.h"
#include "memory/common/OverrideCppNewDelete.h"
DEFINE_bool(benchmarkNetwork, false, "Run in network benchmark mode");
DEFINE_bool(benchmarkStorage, false, "Run in storage benchmark mode");
DEFINE_bool(ignoreIOError, false, "Ignore all IO errors");
DEFINE_bool(injectRandomServerError, false, "Inject random server errors");
DEFINE_bool(injectRandomClientError, false, "Inject random client errors");
DEFINE_bool(retryPermanentError, false, "Retry requests with permanent errors");
DEFINE_bool(verifyReadData, false, "Check if the read data are correct");
DEFINE_bool(verifyReadChecksum, false, "Verify the checksum of read IOs");
DEFINE_bool(verifyWriteChecksum, true, "Verify the checksum of write IOs");
DEFINE_bool(randomShuffleChunkIds, false, "Random shuffle generated chunk IDs");
DEFINE_bool(generateTestData, true, "Generate test data for read test");
DEFINE_bool(sparseChunkIds, false, "Generate sparse chunk IDs");
DEFINE_bool(truncateChunks, false, "Truncate chunks");
DEFINE_bool(cleanupChunks, false, "Clean up (remove) chunks after benchmark");
DEFINE_bool(cleanupChunksBeforeBench, false, "Clean up (remove) chunks before benchmark");
DEFINE_bool(serverMode, false, "Run in server mode");
DEFINE_bool(clientMode, false, "Run in client mode");
DEFINE_bool(clusterMode, false, "Run in cluster mode (get routing info from mgmtd)");
DEFINE_bool(printMetrics, false, "Enable printing metrics in logs");
DEFINE_bool(reportMetrics, false, "Enable reporting metrics to ClickHouse");
DEFINE_uint32(metaStoreType, 0, "Metadata store type (0=LevelDB, 1=RocksDB, 2=MemDB)");
DEFINE_uint32(chunkSizeKB, 512, "Chunk size (KB)");
DEFINE_uint32(chainTableId, 0, "Chain table id");
DEFINE_uint32(chainTableVersion, 0, "Chain table version");
DEFINE_string(chainIds, "", "List of chain ids");
DEFINE_string(storageNodeIds, "", "List of storage node ids");
DEFINE_uint32(numChains, 1, "Number of chains");
DEFINE_uint32(numReplicas, 1, "Number of replicas");
DEFINE_uint32(numStorageNodes, 1, "Number of storage nodes");
DEFINE_uint32(numChunks, 1, "Number of chunks");
DEFINE_uint32(readSize, 4096, "Read IO size");
DEFINE_uint32(writeSize, 131072, "Write IO size");
DEFINE_uint32(memoryAlignment, 1, "Alignment size of each IO buffer");
DEFINE_uint32(readOffAlignment, 0, "Alignment size of each read IO offset");
DEFINE_uint32(batchSize, 1, "Read/write batch size");
DEFINE_uint32(readBatchSize, 0, "Read batch size");
DEFINE_uint32(writeBatchSize, 0, "Write batch size");
DEFINE_uint32(removeBatchSize, 0, "Remove batch size");
DEFINE_uint32(numReadSecs, 0, "Read test duration");
DEFINE_uint32(numWriteSecs, 0, "Write test duration");
DEFINE_uint32(numCoroutines, 1, "Number of coroutines");
DEFINE_uint32(numTestThreads, 1, "Number of test threads");
DEFINE_uint32(randSeed, 0, "Random seed for chunk id generation");
DEFINE_uint32(chunkIdPrefix, 0xFFFF, "The most significant 2 bytes of chunk ids");
DEFINE_uint32(serviceLevel, 0, "Service level");
DEFINE_uint32(listenPort, 0, "Listen port");
DEFINE_uint32(clientTimeoutMS, 30000, "Client timeout (milliseconds)");
DEFINE_string(dataPaths, folly::fs::temp_directory_path().string(), "Comma or space separated list of paths");
DEFINE_string(clientConfig, "", "Path of client config");
DEFINE_string(serverConfig, "", "Path of server config");
DEFINE_string(statsFilePath, "./perfstats.csv", "Path of performance stats file");
DEFINE_string(ibvDevices, "mlx5_0,mlx5_1", "Comma or space separated list of ibv devices");
DEFINE_string(ibnetZones, "", "Comma or space separated list of IB network zones");
DEFINE_string(clusterId, "stage", "Cluster id used to connect to mgmtd");
DEFINE_string(mgmtdEndpoints,
"",
"Comma or space separated list of mgmtd endpoints, "
"e.g. 'RDMA://10.1.1.1:1234,RDMA://10.1.1.2:1234'");
DEFINE_string(storageEndpoints,
"",
"Comma or space separated list of storage ids and endpoints, "
"e.g. '1@RDMA://10.1.1.1:1234,2@RDMA://10.1.1.2:1234'");
DEFINE_string(monitorEndpoint, "", "Monitor endpoint");
DEFINE_uint32(defaultPKeyIndex, 1, "IB default pkey index");
namespace hf3fs::storage::benchmark {
using namespace hf3fs::storage::client;
std::vector<uint32_t> stringToIntVec(const std::string &str) {
std::vector<uint32_t> vec;
std::vector<std::string> substrs;
boost::split(substrs, str, boost::is_any_of(", "));
for (auto s : substrs) {
boost::trim(s);
if (s.empty()) continue;
uint32_t n = std::stoul(s);
vec.push_back(n);
}
return vec;
}
bool runBenchmarks() {
std::vector<std::string> dataPathStrs;
boost::split(dataPathStrs, FLAGS_dataPaths, boost::is_any_of(", "));
std::vector<hf3fs::Path> dataPaths;
dataPaths.reserve(dataPathStrs.size());
for (auto str : dataPathStrs) {
boost::trim(str);
if (str.empty()) continue;
dataPaths.emplace_back(str);
}
std::vector<std::string> endpointRawStrs;
boost::split(endpointRawStrs, FLAGS_storageEndpoints, boost::is_any_of(", "));
std::map<NodeId, net::Address> storageEndpoints;
for (auto str : endpointRawStrs) {
boost::trim(str);
if (str.empty()) continue;
std::vector<std::string> nodeEndpointStrs;
boost::split(nodeEndpointStrs, str, boost::is_any_of("@"));
if (nodeEndpointStrs.size() != 2) {
XLOGF(ERR, "Wrong node endpoint string: {}", str);
return false;
}
auto nodeIdStr = nodeEndpointStrs[0];
auto endpointStr = nodeEndpointStrs[1];
NodeId nodeId{std::stoul(nodeIdStr)};
auto endpoint = net::Address::fromString(endpointStr);
storageEndpoints[nodeId] = endpoint;
XLOGF(WARN, "Add storage endpoint: {} @ {}", nodeId, endpoint);
}
if (FLAGS_clientMode && storageEndpoints.empty()) {
XLOGF(ERR, "No storage endpoint specified for client mode");
return false;
}
if (FLAGS_readSize > FLAGS_chunkSizeKB * 1024) {
XLOGF(ERR, "Read size {} is greater than chunk size {}", FLAGS_readSize, FLAGS_chunkSizeKB * 1024);
return false;
}
auto metaStoreType = static_cast<kv::KVStore::Type>(FLAGS_metaStoreType);
test::SystemSetupConfig setupConfig = {
FLAGS_chunkSizeKB * 1024 /*chunkSize*/,
FLAGS_numChains /*numChains*/,
FLAGS_numReplicas /*numReplicas*/,
FLAGS_numStorageNodes /*numStorageNodes*/,
dataPaths /*dataPaths*/,
FLAGS_clientConfig,
FLAGS_serverConfig,
storageEndpoints,
FLAGS_serviceLevel,
FLAGS_listenPort,
StorageClient::ImplementationType::RPC /*clientImplType*/,
metaStoreType,
true /*useFakeMgmtdClient*/,
!FLAGS_clientMode /*startStorageServer*/,
false,
};
std::vector<std::string> ibvDevices;
boost::split(ibvDevices, FLAGS_ibvDevices, boost::is_any_of(", "));
std::vector<std::string> ibnetZones;
boost::split(ibnetZones, FLAGS_ibnetZones, boost::is_any_of(", "));
endpointRawStrs.clear();
boost::split(endpointRawStrs, FLAGS_mgmtdEndpoints, boost::is_any_of(", "));
std::vector<net::Address> mgmtdEndpoints;
for (auto str : endpointRawStrs) {
boost::trim(str);
if (str.empty()) continue;
auto endpoint = net::Address::fromString(str);
mgmtdEndpoints.push_back(endpoint);
XLOGF(WARN, "Add mgmtd endpoint: {}", endpoint);
}
StorageBench::Options benchOptions{FLAGS_numChunks,
FLAGS_readSize,
FLAGS_writeSize,
FLAGS_batchSize,
FLAGS_numReadSecs,
FLAGS_numWriteSecs,
FLAGS_clientTimeoutMS,
FLAGS_numCoroutines,
FLAGS_numTestThreads,
FLAGS_randSeed,
(uint16_t)FLAGS_chunkIdPrefix,
FLAGS_benchmarkNetwork,
FLAGS_benchmarkStorage,
FLAGS_ignoreIOError,
FLAGS_injectRandomServerError,
FLAGS_injectRandomClientError,
FLAGS_retryPermanentError,
FLAGS_verifyReadData,
FLAGS_verifyReadChecksum,
FLAGS_verifyWriteChecksum,
FLAGS_randomShuffleChunkIds,
FLAGS_generateTestData,
FLAGS_sparseChunkIds,
FLAGS_statsFilePath,
ibvDevices,
ibnetZones,
mgmtdEndpoints,
FLAGS_clusterId,
FLAGS_chainTableId,
FLAGS_chainTableVersion,
stringToIntVec(FLAGS_chainIds),
stringToIntVec(FLAGS_storageNodeIds),
FLAGS_memoryAlignment,
FLAGS_readOffAlignment,
FLAGS_defaultPKeyIndex,
FLAGS_readBatchSize,
FLAGS_writeBatchSize,
FLAGS_removeBatchSize};
StorageBench bench(setupConfig, benchOptions);
if (FLAGS_clusterMode) {
if (!bench.connect()) {
XLOGF(WARN, "Failed to connect to cluster");
return false;
}
} else {
if (!bench.setup()) {
XLOGF(WARN, "Failed to set up benchmark");
return false;
}
}
bench.generateChunkIds();
if (FLAGS_cleanupChunksBeforeBench) {
bench.cleanup();
}
bool runOK = true;
if (FLAGS_serverMode) {
XLOGF(WARN, "Waiting...");
while (true) {
::sleep(1);
}
} else {
runOK = bench.run();
}
if (FLAGS_truncateChunks) {
bench.truncate();
}
if (FLAGS_cleanupChunks) {
bench.cleanup();
}
bench.teardown();
return runOK;
}
} // namespace hf3fs::storage::benchmark
int main(int argc, char **argv) {
folly::init(&argc, &argv, true);
hf3fs::monitor::Monitor::Config monitorConfig;
if (FLAGS_printMetrics || FLAGS_reportMetrics) {
if (FLAGS_printMetrics) {
monitorConfig.reporters(0).set_type("log");
} else if (FLAGS_reportMetrics) {
monitorConfig.reporters(0).set_type("monitor_collector");
monitorConfig.reporters(0).monitor_collector().set_remote_ip(FLAGS_monitorEndpoint);
monitorConfig.set_reporters_length(1);
}
auto monitorResult = hf3fs::monitor::Monitor::start(monitorConfig);
XLOGF_IF(FATAL, !monitorResult, "Failed to start monitor: {}", monitorResult.error());
}
bool ok = hf3fs::storage::benchmark::runBenchmarks();
hf3fs::monitor::Monitor::stop();
hf3fs::memory::shutdown();
return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}

View File

@@ -0,0 +1,895 @@
#pragma once
#include <boost/algorithm/string.hpp>
#include <boost/core/ignore_unused.hpp>
#include <common/utils/UtcTime.h>
#include <folly/experimental/coro/Collect.h>
#include <folly/futures/Barrier.h>
#include <folly/stats/TDigest.h>
#include <numeric>
#include <optional>
#include <random>
#include <vector>
#include "common/logging/LogInit.h"
#include "common/net/ib/IBDevice.h"
#include "common/utils/Duration.h"
#include "common/utils/SysResource.h"
#include "tests/lib/UnitTestFabric.h"
namespace hf3fs::storage::benchmark {
using namespace hf3fs::storage::client;
class StorageBench : public test::UnitTestFabric {
public:
// Benchmark configuration, populated from command-line flags. All fields
// are immutable except the three batch-size overrides at the bottom, which
// are resolved from `batchSize` in the StorageBench constructor when 0.
struct Options {
  const size_t numChunks;   // chunks generated per chain per coroutine
  const size_t readSize;    // bytes per read IO
  const size_t writeSize;   // bytes per write IO
  const size_t batchSize;   // default batch size for read/write/remove ops
  const uint64_t numReadSecs;    // read benchmark duration; 0 skips the read phase
  const uint64_t numWriteSecs;   // write benchmark duration; 0 skips the write phase
  const uint64_t clientTimeoutMS;  // storage client max retry time (ms)
  const size_t numCoroutines;    // number of concurrent worker coroutines
  const size_t numTestThreads;   // threads in the benchmark executor
  const uint32_t randSeed = 0;   // seed for chunk id generation (reproducible ids)
  const uint16_t chunkIdPrefix = 0xFFFF;  // top 16 bits of every generated chunk id
  const bool benchmarkNetwork = false;    // bypass disk IO (debug option) to measure the network
  const bool benchmarkStorage = false;    // bypass RDMA xmit (debug option) to measure storage
  const bool ignoreIOError = false;       // keep running on per-IO errors
  const bool injectRandomServerError = false;
  const bool injectRandomClientError = false;
  const bool retryPermanentError = false;
  const bool verifyReadData = false;      // compare read bytes against the write pattern
  const bool verifyReadChecksum = false;
  const bool verifyWriteChecksum = true;
  const bool randomShuffleChunkIds = true;   // shuffle each coroutine's chunk list
  const bool generateTestData = true;        // pre-create all chunks before reading
  const bool sparseChunkIds = true;          // random ids instead of sequential ones
  const std::string statsFilePath = "./perfstats.csv";  // CSV output; "" disables stats
  const std::vector<std::string> ibvDevices = {};       // IB device name filter
  const std::vector<std::string> ibnetZones = {};       // "zone:subnet" entries
  const std::vector<net::Address> mgmtdEndpoints = {};  // cluster mode: mgmtd servers
  const std::string clusterId = kClusterId;
  const uint32_t chainTableId = 0;
  const uint32_t chainTableVersion = 0;  // 0 selects the latest table version
  const std::vector<uint32_t> chainIds = {};        // restrict benchmark to these chains
  const std::vector<uint32_t> storageNodeIds = {};  // restrict to chains on these nodes
  const size_t memoryAlignment = 1;   // alignment of registered IO buffers
  const size_t readOffAlignment = 0;  // 0 = align read offsets to the read size
  const size_t defaultPKeyIndex = 1;
  // Per-op batch sizes; 0 means "use batchSize" (resolved in the ctor).
  size_t readBatchSize = 0;
  size_t writeBatchSize = 0;
  size_t removeBatchSize = 0;
};
 private:
  // Max number of centroids kept by each folly::TDigest latency sketch.
  static constexpr uint32_t kTDigestMaxSize = 1000;
  // Per-chunk bookkeeping: where the chunk lives and how many bytes have
  // been written to it so far (grows up to the configured chunk size).
  struct ChunkInfo {
    ChainId chainId;
    ChunkId chunkId;
    size_t size;
  };
  Options benchOptions_;
  std::vector<folly::TDigest> writeLatencyDigests_;  // one digest per coroutine
  std::vector<folly::TDigest> readLatencyDigests_;   // one digest per coroutine
  folly::CPUThreadPoolExecutor testExecutor_;
  std::atomic_uint64_t numWriteBytes_;  // total bytes written, summed across coroutines
  std::atomic_uint64_t numReadBytes_;   // total bytes read, summed across coroutines
  folly::Random::DefaultGenerator randGen_;
  std::vector<std::vector<ChunkInfo>> chunkInfos_;  // chunk list per coroutine
  std::vector<size_t> numCreatedChunks_;  // fully-written chunk count per coroutine
  size_t totalNumChunks_;   // chains * coroutines * numChunks
  double totalChunkGiB_;    // totalNumChunks_ * chunk_size, in GiB
public:
// Builds a benchmark harness on top of the unit-test fabric.
// Any of the per-op batch-size overrides left at 0 falls back to the
// common options.batchSize.
StorageBench(const test::SystemSetupConfig &setupConfig, const Options &options)
    : UnitTestFabric(setupConfig),
      benchOptions_(options),
      writeLatencyDigests_(benchOptions_.numCoroutines, folly::TDigest(kTDigestMaxSize)),
      readLatencyDigests_(benchOptions_.numCoroutines, folly::TDigest(kTDigestMaxSize)),
      testExecutor_(benchOptions_.numTestThreads),
      numWriteBytes_(0),
      numReadBytes_(0),
      randGen_(folly::Random::create()),
      chunkInfos_(benchOptions_.numCoroutines),
      numCreatedChunks_(benchOptions_.numCoroutines) {
  // Resolve the "0 = use the common batch size" defaults.
  if (benchOptions_.readBatchSize == 0) benchOptions_.readBatchSize = benchOptions_.batchSize;
  if (benchOptions_.writeBatchSize == 0) benchOptions_.writeBatchSize = benchOptions_.batchSize;
  if (benchOptions_.removeBatchSize == 0) benchOptions_.removeBatchSize = benchOptions_.batchSize;
}
// Pre-generates the per-coroutine lists of chunk ids used by the benchmark.
// Every id carries the configured 16-bit prefix in its top bits so benchmark
// chunks are easy to identify (and remove) later. Generation is seeded with
// benchOptions_.randSeed, so the same options reproduce the same ids.
void generateChunkIds() {
  static_assert(sizeof(benchOptions_.chunkIdPrefix) == 2);
  // Place the 16-bit prefix in the top 16 bits of a 64-bit id.
  uint64_t chunkIdPrefix64 = ((uint64_t)benchOptions_.chunkIdPrefix) << (UINT64_WIDTH - UINT16_WIDTH);
  std::sort(chainIds_.begin(), chainIds_.end());
  // Default-constructed mt19937 => deterministic shuffle order.
  static thread_local std::mt19937 generator;
  randGen_.seed(benchOptions_.randSeed);
  // Fix: print the full 64-bit prefix ({:016X}; {:08X} was too narrow).
  XLOGF(WARN,
        "Generating {} chunk ids with prefix {:016X} and random seed {}...",
        totalNumChunks_,
        chunkIdPrefix64,
        benchOptions_.randSeed);
  for (auto &chunkInfos : chunkInfos_) {
    // Mask the random bits to the low 48 bits so they cannot clobber the
    // 16-bit prefix. The previous unmasked OR only preserved the prefix for
    // the all-ones default 0xFFFF (for which this change is a no-op).
    uint64_t instancePrefix = chunkIdPrefix64 | (folly::Random::rand64(randGen_) & 0x0000FFFFFFFFFFFF);
    XLOGF(DBG3, "Random chunk id prefix {:016X}", instancePrefix);
    chunkInfos.reserve(chainIds_.size() * benchOptions_.numChunks);
    for (auto chainId : chainIds_) {
      for (size_t chunkIndex = 0; chunkIndex < benchOptions_.numChunks; chunkIndex++) {
        if (benchOptions_.sparseChunkIds) {
          // Sparse mode: random high/low halves (40 random bits below the
          // prefix in the high half; chunkIndex folded into the low half).
          uint64_t chunkIdHigh = chunkIdPrefix64 | (folly::Random::rand64(randGen_) & 0x000000FFFFFFFFFF);
          uint64_t chunkIdLow = (folly::Random::rand64(randGen_) << UINT32_WIDTH) + chunkIndex;
          chunkInfos.push_back({chainId, ChunkId(chunkIdHigh, chunkIdLow), 0});
        } else {
          // Dense mode: sequential ids under one per-coroutine random prefix.
          chunkInfos.push_back({chainId, ChunkId(instancePrefix, chunkIndex), 0});
        }
      }
    }
    if (benchOptions_.randomShuffleChunkIds) std::shuffle(chunkInfos.begin(), chunkInfos.end(), generator);
  }
}
// Connects the benchmark to an existing cluster (cluster mode): initializes
// IB networking, starts a mgmtd client, selects the replication chains to
// benchmark against, and finally creates the storage client.
// Returns false on any setup failure.
bool connect() {
  XLOGF(INFO, "Start to connect...");
  if (!setupIBSock()) {
    return false;
  }
  // Configure the mgmtd client: auto-refresh routing info and keep the
  // client session alive, but no heartbeat (this is a client, not a node).
  mgmtdClientConfig_.set_mgmtd_server_addresses(benchOptions_.mgmtdEndpoints);
  mgmtdClientConfig_.set_enable_auto_refresh(true);
  mgmtdClientConfig_.set_enable_auto_heartbeat(false);
  mgmtdClientConfig_.set_enable_auto_extend_client_session(true);
  mgmtdClientConfig_.set_auto_refresh_interval(3_s);
  mgmtdClientConfig_.set_auto_heartbeat_interval(3_s);
  mgmtdClientConfig_.set_auto_extend_client_session_interval(3_s);
  mgmtdClientConfig_.set_accept_incomplete_routing_info_during_mgmtd_bootstrapping(false);
  if (!client_.start()) {
    XLOGF(ERR, "Failed to start net client for mgmtd client");
    return false;
  }
  XLOGF(INFO, "Creating mgmtd client...");
  auto stubFactory = std::make_unique<hf3fs::stubs::RealStubFactory<hf3fs::mgmtd::MgmtdServiceStub>>(
      stubs::ClientContextCreator{[this](net::Address addr) { return client_.serdeCtx(addr); }});
  auto mgmtdClient = std::make_unique<hf3fs::client::MgmtdClientForClient>(benchOptions_.clusterId,
                                                                           std::move(stubFactory),
                                                                           mgmtdClientConfig_);
  // Identify this client by physical host and container host in the session.
  auto physicalHostnameRes = SysResource::hostname(/*physicalMachineName=*/true);
  if (!physicalHostnameRes) {
    XLOGF(ERR, "getHostname(true) failed: {}", physicalHostnameRes.error());
    return false;
  }
  auto containerHostnameRes = SysResource::hostname(/*physicalMachineName=*/false);
  if (!containerHostnameRes) {
    XLOGF(ERR, "getHostname(false) failed: {}", containerHostnameRes.error());
    return false;
  }
  mgmtdClient->setClientSessionPayload({clientId_.uuid.toHexString(),
                                        flat::NodeType::CLIENT,
                                        flat::ClientSessionData::create(
                                            /*universalId=*/*physicalHostnameRes,
                                            /*description=*/fmt::format("StorageBench: {}", *containerHostnameRes),
                                            /*serviceGroups=*/std::vector<flat::ServiceGroupInfo>{},
                                            flat::ReleaseVersion::fromVersionInfo()),
                                        flat::UserInfo{}});
  folly::coro::blockingWait(mgmtdClient->start(&client_.tpg().bgThreadPool().randomPick()));
  mgmtdForClient_ = std::move(mgmtdClient);
  // get routing info: poll mgmtd until the requested chain table shows up
  // (up to 15 attempts, one second apart).
  for (size_t retry = 0; retry < 15; retry++) {
    auto routingInfo = mgmtdForClient_->getRoutingInfo();
    if (routingInfo == nullptr || routingInfo->raw()->chains.empty()) {
      XLOGF(WARN, "Empty routing info, #{} retry...", retry + 1);
      std::this_thread::sleep_for(std::chrono::milliseconds(1000));
    } else {
      for (const auto &[tableId, tableVersions] : routingInfo->raw()->chainTables) {
        if (tableId == benchOptions_.chainTableId) {
          if (tableVersions.empty()) {
            XLOGF(WARN, "No version found for chain table with id {}", tableId);
            return false;
          }
          XLOGF(INFO, "Found {} version(s) of chain table {}", tableVersions.size(), benchOptions_.chainTableId);
          flat::ChainTable chainTable;
          if (benchOptions_.chainTableVersion > 0) {
            // A specific table version was requested; fail if it is missing.
            flat::ChainTableVersion tableVersion(benchOptions_.chainTableVersion);
            auto tableIter = tableVersions.find(tableVersion);
            if (tableIter == tableVersions.end()) {
              XLOGF(WARN, "Version {} not found in chain table with id {}", tableVersion, tableId);
              return false;
            }
            chainTable = tableIter->second;
            XLOGF(INFO,
                  "Found version {} of chain table {}: {}",
                  benchOptions_.chainTableVersion,
                  benchOptions_.chainTableId,
                  chainTable.chainTableVersion);
          } else {
            // No version specified: take the last (highest) version.
            const auto iter = --tableVersions.cend();
            const auto &latestTable = iter->second;
            chainTable = latestTable;
            XLOGF(INFO,
                  "Found latest version of chain table {}: {}",
                  benchOptions_.chainTableId,
                  chainTable.chainTableVersion);
          }
          XLOGF(WARN,
                "Selected chain table: {}@{} [{}] {} chains",
                chainTable.chainTableId,
                chainTable.chainTableVersion,
                chainTable.desc,
                chainTable.chains.size());
          if (!benchOptions_.storageNodeIds.empty()) {
            // Keep only chains with at least one target on the given nodes.
            for (const auto &chainId : chainTable.chains) {
              const auto chainInfo = routingInfo->raw()->getChain(chainId);
              for (const auto &target : chainInfo->targets) {
                const auto targetInfo = routingInfo->raw()->getTarget(target.targetId);
                auto nodeIter = std::find(benchOptions_.storageNodeIds.begin(),
                                          benchOptions_.storageNodeIds.end(),
                                          *targetInfo->nodeId);
                if (nodeIter != benchOptions_.storageNodeIds.end()) {
                  chainIds_.push_back(chainId);
                  break;
                }
              }
            }
          } else if (!benchOptions_.chainIds.empty()) {
            // Keep only the explicitly requested chain ids.
            for (const auto &chainId : chainTable.chains) {
              auto chainIter = std::find(benchOptions_.chainIds.begin(), benchOptions_.chainIds.end(), chainId);
              if (chainIter != benchOptions_.chainIds.end()) {
                chainIds_.push_back(chainId);
              }
            }
          } else {
            // No filter: benchmark every chain in the table.
            chainIds_ = chainTable.chains;
          }
          break;
        }
      }
      if (!chainIds_.empty()) break;
    }
  }
  if (chainIds_.empty()) {
    XLOGF(ERR, "Failed to get chain table with id {}", benchOptions_.chainTableId);
    return false;
  } else {
    XLOGF(WARN, "Selected {} replication chains for benchmark", chainIds_.size());
  }
  // create storage client from the config file referenced by the setup
  if (setupConfig_.client_config().empty()) {
    XLOGF(ERR, "Storage client config not specified");
    return false;
  }
  auto configRes = clientConfig_.atomicallyUpdate(setupConfig_.client_config(), false /*isHotUpdate*/);
  if (!configRes) {
    XLOGF(ERR, "Cannot load client config from {}, error: {}", setupConfig_.client_config(), configRes.error());
    return false;
  }
  totalNumChunks_ = chainIds_.size() * benchOptions_.numCoroutines * benchOptions_.numChunks;
  totalChunkGiB_ = (double)totalNumChunks_ * setupConfig_.chunk_size() / 1_GB;
  clientConfig_.retry().set_max_retry_time(Duration(std::chrono::milliseconds(benchOptions_.clientTimeoutMS)));
  clientConfig_.net_client().io_worker().ibsocket().set_sl(setupConfig_.service_level());
  XLOGF(INFO, "Creating storage client...");
  storageClient_ = client::StorageClient::create(clientId_, clientConfig_, *mgmtdForClient_);
  return true;
}
// Initializes the InfiniBand layer: parses the "zone:subnet" strings from
// benchOptions_.ibnetZones, applies the device filter and pkey index, and
// starts net::IBManager. Returns false on invalid input or IB failure.
bool setupIBSock() {
  XLOGF(WARN, "Setting up IB socket...");
  std::vector<net::IBConfig::Subnet> subnets;
  for (const auto &ibnetZoneStr : benchOptions_.ibnetZones) {
    // Each entry must look like "<zone>:<subnet>" (exactly one colon).
    std::vector<std::string> ibnetZoneSubnet;
    boost::split(ibnetZoneSubnet, ibnetZoneStr, boost::is_any_of(":"));
    if (ibnetZoneSubnet.size() != 2) {
      XLOGF(CRITICAL, "Invalid IB zone subnet: {}", ibnetZoneStr);
      return false;
    }
    auto zone = boost::trim_copy(ibnetZoneSubnet[0]);
    auto subnet = boost::trim_copy(ibnetZoneSubnet[1]);
    if (zone.empty() || subnet.empty()) {
      XLOGF(CRITICAL, "Invalid IB zone subnet: {}", ibnetZoneStr);
      return false;
    }
    subnets.emplace_back();
    subnets.back().set_network_zones({zone});
    subnets.back().set_subnet(*net::IBConfig::Network::from(subnet));
    XLOGF(WARN, "Add IB network zone: {} -- {}", zone, subnet);
  }
  net::IBConfig ibConfig;
  ibConfig.set_subnets(subnets);
  ibConfig.set_allow_unknown_zone(false);
  // NOTE(review): presumably expanded from the environment by the IB layer
  // -- confirm against net::IBConfig.
  ibConfig.set_default_network_zone("$HF3FS_NETWORK_ZONE");
  ibConfig.set_device_filter(benchOptions_.ibvDevices);
  ibConfig.set_default_pkey_index(benchOptions_.defaultPKeyIndex);
  auto ibResult = net::IBManager::start(ibConfig);
  if (ibResult.hasError()) {
    XLOGF(CRITICAL, "Cannot initialize IB device: {}", ibResult.error());
    return false;
  }
  return true;
}
// Stands up a local storage system (non-cluster mode) and derives the chunk
// totals and client retry budget from the options. Counterpart of connect().
bool setup() {
  XLOGF(WARN, "Setting up benchmark...");
  if (!setupIBSock()) {
    return false;
  }
  bool ok = setUpStorageSystem();
  totalNumChunks_ = chainIds_.size() * benchOptions_.numCoroutines * benchOptions_.numChunks;
  totalChunkGiB_ = (double)totalNumChunks_ * setupConfig_.chunk_size() / 1_GB;
  clientConfig_.retry().set_max_retry_time(Duration(std::chrono::milliseconds(benchOptions_.clientTimeoutMS)));
  return ok;
}
// Shuts down the storage system and the IB layer (reverse of setup()).
void teardown() {
  tearDownStorageSystem();
  net::IBManager::stop();
}
// Logs the average throughput (GiB/s) over the elapsed wall-clock time.
void printThroughput(hf3fs::SteadyClock::duration elapsedMicro, double totalGiB) {
  const auto millis = std::chrono::duration_cast<std::chrono::milliseconds>(elapsedMicro).count();
  const double elapsedSecs = millis / 1000.0;
  XLOGF(WARN, "Average throughput: {:.3f}GiB/s, total {:.3f} GiB", totalGiB / elapsedSecs, totalGiB);
}
// Logs a human-readable latency summary: sample count, min/max/avg, and a
// fixed set of quantiles, all in microseconds.
void printLatencyDigest(const folly::TDigest &digest) {
  XLOGF(WARN, "latency summary ({} samples)", digest.count());
  XLOGF(WARN, "min: {:10.1f}us", digest.min());
  XLOGF(WARN, "max: {:10.1f}us", digest.max());
  XLOGF(WARN, "avg: {:10.1f}us", digest.mean());
  static constexpr double kQuantiles[] = {0.1, 0.2, 0.5, 0.9, 0.95, 0.99};
  for (const double quantile : kQuantiles) {
    XLOGF(WARN, "{}%: {:10.1f}us", quantile * 100.0, digest.estimateQuantile(quantile));
  }
}
// Appends one CSV row of benchmark stats to benchOptions_.statsFilePath,
// writing the header line first when the file is missing or empty.
// No-op when the stats file path is empty.
//   testName    - row label, e.g. "batch write" / "batch read"
//   digest      - merged per-batch RPC latency digest (microseconds)
//   elapsedTime - wall-clock duration of the benchmark phase
//   totalGiB    - total bytes transferred, in GiB
//   readIO      - selects the read vs write IO/batch sizes for the row
void dumpPerfStats(const std::string &testName,
                   const folly::TDigest &digest,
                   hf3fs::SteadyClock::duration elapsedTime,
                   double totalGiB,
                   bool readIO) {
  if (benchOptions_.statsFilePath.empty()) return;
  boost::filesystem::path outFilePath(benchOptions_.statsFilePath);
  if (!boost::filesystem::exists(outFilePath) || boost::filesystem::is_empty(outFilePath)) {
    XLOGF(INFO, "Create a file for perfermance stats at {}", outFilePath);
    boost::filesystem::save_string_file(
        outFilePath,
        "test name,#storages,#chains,#replicas,concurrency,batch size,"
        "io size (bytes),effective batch size (batch size / #replicas),elapsed time (us),"
        "QPS,IOPS,bandwidth (MB/s),latency samples,min latency (us),max latency (us),avg latency (us),"
        "latency P50 (us),latency P75 (us),latency P90 (us),latency P95 (us),latency P99 (us)\n");
  }
  auto elapsedMicro = std::chrono::duration_cast<std::chrono::microseconds>(elapsedTime);
  double bandwidthMBps = totalGiB * 1024.0 / (elapsedMicro.count() / 1'000'000.0);
  size_t ioSize = readIO ? benchOptions_.readSize : benchOptions_.writeSize;
  size_t batchSize = readIO ? benchOptions_.readBatchSize : benchOptions_.writeBatchSize;
  // IOPS counts individual IOs per second; QPS counts whole batches.
  double iops = bandwidthMBps * 1024.0 * 1024.0 / ioSize;
  double qps = bandwidthMBps * 1024.0 * 1024.0 / (batchSize * ioSize);
  boost::filesystem::ofstream fout(outFilePath, std::ios_base::app);
  // NOTE(review): the CSV header labels the 8th column "batch size /
  // #replicas", but the value below divides by num_storage_nodes() --
  // confirm which is intended.
  fout << fmt::format("{},{},{},{},{},{},{},{:.1f},{},{:.1f},{:.1f},{:.3f},{},{:.1f},{:.1f},{:.1f}",
                      testName,
                      setupConfig_.num_storage_nodes(),
                      setupConfig_.num_chains(),
                      setupConfig_.num_replicas(),
                      benchOptions_.numCoroutines,
                      batchSize,
                      ioSize,
                      double(batchSize) / setupConfig_.num_storage_nodes(),
                      elapsedMicro.count(),
                      qps,
                      iops,
                      bandwidthMBps,
                      digest.count(),
                      digest.min(),
                      digest.max(),
                      digest.mean());
  for (double p : {0.5, 0.75, 0.9, 0.95, 0.99}) {
    fout << fmt::format(",{:.1f}", digest.estimateQuantile(p));
  }
  fout << "\n";
  fout.close();
}
// Write worker coroutine for one coroutine slot. Two modes, selected by
// numWriteSecs:
//   - numWriteSecs == 0: write until every chunk in chunkInfos_[instanceId]
//     is fully created (used by generateChunks()).
//   - numWriteSecs > 0: issue timed batches; fully-created chunks get
//     overwrites at random offsets.
// Records per-batch RPC latency into writeLatencyDigests_[instanceId] and
// adds the written byte count to numWriteBytes_.
// Returns StatusCode::kOK or the first error code encountered.
CoTask<uint32_t> batchWrite(uint32_t instanceId, size_t writeBatchSize, size_t writeSize, uint32_t numWriteSecs) {
  // create an aligned memory block large enough for one full chunk
  size_t memoryBlockSize = ALIGN_UPPER(setupConfig_.chunk_size(), benchOptions_.memoryAlignment);
  auto memoryBlock = (uint8_t *)folly::aligned_malloc(memoryBlockSize, sysconf(_SC_PAGESIZE));
  auto deleter = [](uint8_t *ptr) { folly::aligned_free(ptr); };
  std::unique_ptr<uint8_t, decltype(deleter)> memoryBlockPtr(memoryBlock, deleter);
  std::memset(memoryBlock, 0xFF, memoryBlockSize);
  if (benchOptions_.verifyReadData) {
    // Deterministic pattern (byte i == i mod 256) that batchRead verifies.
    for (size_t byteIndex = 0; byteIndex < memoryBlockSize; byteIndex++) {
      memoryBlock[byteIndex] = byteIndex;
    }
  }
  // register the memory block with the storage client
  auto regRes = storageClient_->registerIOBuffer(memoryBlock, memoryBlockSize);
  if (regRes.hasError()) {
    co_return regRes.error().code();
  }
  // create write IOs
  auto ioBuffer = std::move(*regRes);
  WriteOptions options;
  options.set_enableChecksum(benchOptions_.verifyWriteChecksum);
  options.debug().set_bypass_disk_io(benchOptions_.benchmarkNetwork);
  options.debug().set_bypass_rdma_xmit(benchOptions_.benchmarkStorage);
  options.debug().set_inject_random_server_error(benchOptions_.injectRandomServerError);
  options.debug().set_inject_random_client_error(benchOptions_.injectRandomClientError);
  options.retry().set_retry_permanent_error(benchOptions_.retryPermanentError);
  std::vector<double> elapsedMicroSecs;  // per-batch RPC latencies (us)
  uint64_t numWriteBytes = 0;
  std::vector<WriteIO> writeIOs;
  writeIOs.reserve(writeBatchSize);
  auto benchStart = hf3fs::SteadyClock::now();
  std::vector<ChunkInfo> &chunkInfos = chunkInfos_[instanceId];
  size_t &numCreatedChunks = numCreatedChunks_[instanceId];
  size_t seqChunkIndex = 0;
  while (true) {
    // Stop after the time budget (timed mode) or once all chunks exist.
    if (numWriteSecs) {
      auto accumElapsedSecs =
          std::chrono::duration_cast<std::chrono::seconds>(hf3fs::SteadyClock::now() - benchStart);
      if (accumElapsedSecs >= std::chrono::seconds(numWriteSecs)) break;
    } else {
      if (numCreatedChunks >= chunkInfos.size()) break;
    }
    writeIOs.clear();
    for (size_t writeIndex = 0; writeIndex < writeBatchSize; writeIndex++) {
      // Round-robin over the chunk list; chunkSize tracks bytes written.
      auto &[chainId, chunkId, chunkSize] = chunkInfos[seqChunkIndex++ % chunkInfos.size()];
      size_t writeOffset = 0;
      size_t writeLength = 0;
      if (chunkSize < setupConfig_.chunk_size()) {
        // Chunk not fully created yet: append at its current end.
        writeOffset = chunkSize;
        writeLength = std::min(writeSize, setupConfig_.chunk_size() - writeOffset);
        chunkSize += writeLength;
        numCreatedChunks += chunkSize == setupConfig_.chunk_size();
      } else {
        // Chunk fully created: overwrite at a random offset.
        writeOffset = folly::Random::rand32(0, setupConfig_.chunk_size() - writeSize);
        writeLength = writeSize;
      }
      auto writeIO = storageClient_->createWriteIO(chainId,
                                                   chunkId,
                                                   writeOffset,
                                                   writeLength,
                                                   setupConfig_.chunk_size(),
                                                   &memoryBlock[writeOffset],
                                                   &ioBuffer);
      writeIOs.push_back(std::move(writeIO));
      numWriteBytes += writeLength;
    }
    auto rpcStart = hf3fs::SteadyClock::now();
    co_await storageClient_->batchWrite(writeIOs, flat::UserInfo(), options);
    auto elapsedMicro = std::chrono::duration_cast<std::chrono::microseconds>(hf3fs::SteadyClock::now() - rpcStart);
    elapsedMicroSecs.push_back(elapsedMicro.count());
    if (!benchOptions_.ignoreIOError) {
      // Fail fast on the first bad result unless errors are ignored.
      for (const auto &writeIO : writeIOs) {
        if (writeIO.result.lengthInfo.hasError()) {
          XLOGF(ERR, "Error in write result: {}", writeIO.result);
          co_return writeIO.result.lengthInfo.error().code();
        }
        if (writeIO.length != *writeIO.result.lengthInfo) {
          XLOGF(ERR, "Unexpected write length: {} != {}", *writeIO.result.lengthInfo, writeIO.length);
          co_return StorageClientCode::kRemoteIOError;
        }
      }
    }
  }
  folly::TDigest digest;
  writeLatencyDigests_[instanceId] = digest.merge(elapsedMicroSecs);
  numWriteBytes_ += numWriteBytes;
  co_return StatusCode::kOK;
}
// Read worker coroutine for one coroutine slot: issues timed batches of
// random-chunk, random-offset reads for benchOptions_.numReadSecs seconds,
// optionally verifying lengths and data content. Records per-batch RPC
// latency into readLatencyDigests_[instanceId] and adds the read byte count
// to numReadBytes_.
// Returns StatusCode::kOK or the first error code encountered.
CoTask<uint32_t> batchRead(uint32_t instanceId) {
  // create an aligned memory block with one buffer slot per IO in the batch
  size_t alignedBufSize = ALIGN_UPPER(std::max(size_t(1), benchOptions_.readSize), benchOptions_.memoryAlignment);
  size_t memoryBlockSize = alignedBufSize * benchOptions_.readBatchSize;
  auto memoryBlock = (uint8_t *)folly::aligned_malloc(memoryBlockSize, sysconf(_SC_PAGESIZE));
  auto deleter = [](uint8_t *ptr) { folly::aligned_free(ptr); };
  std::unique_ptr<uint8_t, decltype(deleter)> memoryBlockPtr(memoryBlock, deleter);
  std::memset(memoryBlock, 0, memoryBlockSize);
  // register the memory block with the storage client
  auto regRes = storageClient_->registerIOBuffer(memoryBlock, memoryBlockSize);
  if (regRes.hasError()) {
    co_return regRes.error().code();
  }
  // Expected pattern (byte i == i mod 256) written by batchWrite when
  // verifyReadData is enabled.
  std::vector<uint8_t> expectedChunkData(setupConfig_.chunk_size());
  if (benchOptions_.verifyReadData) {
    for (size_t byteIndex = 0; byteIndex < expectedChunkData.size(); byteIndex++) {
      expectedChunkData[byteIndex] = byteIndex;
    }
  }
  // create read IOs
  auto ioBuffer = std::move(*regRes);
  ReadOptions options;
  options.set_enableChecksum(benchOptions_.verifyReadChecksum);
  options.debug().set_bypass_disk_io(benchOptions_.benchmarkNetwork);
  options.debug().set_bypass_rdma_xmit(benchOptions_.benchmarkStorage);
  options.debug().set_inject_random_server_error(benchOptions_.injectRandomServerError);
  options.debug().set_inject_random_client_error(benchOptions_.injectRandomClientError);
  options.retry().set_retry_permanent_error(benchOptions_.retryPermanentError);
  std::vector<double> elapsedMicroSecs;  // per-batch RPC latencies (us)
  uint64_t numReadBytes = 0;
  // Read offsets are aligned down to this boundary (defaults to read size).
  size_t offsetAlignment =
      benchOptions_.readOffAlignment ? benchOptions_.readOffAlignment : std::max(size_t(1), benchOptions_.readSize);
  std::vector<client::ReadIO> readIOs;
  readIOs.reserve(benchOptions_.readBatchSize);
  auto benchStart = hf3fs::SteadyClock::now();
  std::vector<ChunkInfo> &chunkInfos = chunkInfos_[instanceId];
  while (true) {
    // Run until the read time budget is exhausted.
    auto accumElapsedSecs = std::chrono::duration_cast<std::chrono::seconds>(hf3fs::SteadyClock::now() - benchStart);
    if (accumElapsedSecs >= std::chrono::seconds(benchOptions_.numReadSecs)) break;
    readIOs.clear();
    for (size_t readIndex = 0; readIndex < benchOptions_.readBatchSize; readIndex++) {
      // Pick a random chunk and a random aligned offset inside it.
      uint64_t randChunkIndex = folly::Random::rand64(0, chunkInfos.size());
      const auto &[chainId, chunkId, chunkSize] = chunkInfos[randChunkIndex];
      uint32_t offset = folly::Random::rand32(0, setupConfig_.chunk_size() - benchOptions_.readSize);
      uint32_t alignedOffset = ALIGN_LOWER(offset, offsetAlignment);
      auto readIO = storageClient_->createReadIO(chainId,
                                                 chunkId,
                                                 alignedOffset /*offset*/,
                                                 benchOptions_.readSize /*length*/,
                                                 &memoryBlock[readIndex * alignedBufSize],
                                                 &ioBuffer);
      readIOs.push_back(std::move(readIO));
      numReadBytes += benchOptions_.readSize;
    }
    auto rpcStart = hf3fs::SteadyClock::now();
    co_await storageClient_->batchRead(readIOs, flat::UserInfo(), options);
    auto elapsedMicro = std::chrono::duration_cast<std::chrono::microseconds>(hf3fs::SteadyClock::now() - rpcStart);
    elapsedMicroSecs.push_back(elapsedMicro.count());
    if (!benchOptions_.ignoreIOError) {
      // Fail fast on the first bad result unless errors are ignored.
      for (const auto &readIO : readIOs) {
        if (readIO.result.lengthInfo.hasError()) {
          XLOGF(ERR, "Error in read result: {}", readIO.result);
          co_return readIO.result.lengthInfo.error().code();
        }
        if (readIO.length != *readIO.result.lengthInfo) {
          XLOGF(ERR, "Unexpected read length: {} != {}", *readIO.result.lengthInfo, readIO.length);
          co_return StorageClientCode::kRemoteIOError;
        }
      }
    }
    if (benchOptions_.verifyReadData) {
      // Compare the returned bytes against the deterministic write pattern.
      for (const auto &readIO : readIOs) {
        auto diffPos = std::mismatch(&readIO.data[0], &readIO.data[readIO.length], &expectedChunkData[readIO.offset]);
        uint32_t byteIndex = diffPos.first - &readIO.data[0];
        if (byteIndex < readIO.length) {
          XLOGF(ERR,
                "Wrong data at bytes index {} and chunk offset {}: data {:#x} != expected {:#x}",
                byteIndex,
                readIO.offset + byteIndex,
                *diffPos.first,
                *diffPos.second);
          co_return StorageClientCode::kFoundBug;
        }
      }
    }
  }
  folly::TDigest digest;
  readLatencyDigests_[instanceId] = digest.merge(elapsedMicroSecs);
  numReadBytes_ += numReadBytes;
  co_return StatusCode::kOK;
}
// Fully populates every benchmark chunk by running batchWrite in "create"
// mode (numWriteSecs == 0) on all coroutines, then prints the aggregated
// throughput and latency. Returns StatusCode::kOK or the first failure.
uint32_t generateChunks() {
  XLOGF(WARN, "Generating {} test chunks ({:.3f} GiB)...", totalNumChunks_, totalChunkGiB_);
  auto testStart = hf3fs::SteadyClock::now();
  std::vector<folly::SemiFuture<uint32_t>> writeTasks;
  numWriteBytes_ = 0;
  // Use at least max_concurrent_requests / numCoroutines as the batch size
  // (presumably to saturate the client's write pipeline -- see
  // traffic_control config).
  size_t writeBatchSize =
      std::max(benchOptions_.writeBatchSize,
               clientConfig_.traffic_control().write().max_concurrent_requests() / benchOptions_.numCoroutines);
  for (size_t instanceId = 0; instanceId < benchOptions_.numCoroutines; instanceId++) {
    writeTasks.push_back(batchWrite(instanceId, writeBatchSize, setupConfig_.chunk_size(), 0 /*numWriteSecs*/)
                             .scheduleOn(folly::Executor::getKeepAliveToken(testExecutor_))
                             .start());
  }
  auto results = folly::coro::blockingWait(folly::coro::collectAllRange(std::move(writeTasks)));
  for (auto res : results) {
    if (res != StatusCode::kOK) {
      XLOGF(WARN, "Test task failed with status code: {}", res);
      return res;
    }
  }
  auto elapsedTime = hf3fs::SteadyClock::now() - testStart;
  double totalGiB = (double)numWriteBytes_ / 1_GB;
  printThroughput(elapsedTime, totalGiB);
  auto mergedDigest = folly::TDigest::merge(writeLatencyDigests_);
  printLatencyDigest(mergedDigest);
  return StatusCode::kOK;
}
// Runs the timed write benchmark on every coroutine, then reports the
// aggregated throughput/latency and appends a stats row to the CSV file.
// Returns StatusCode::kOK or the first failing task's status code.
uint32_t runWriteBench() {
  XLOGF(WARN,
        "Running write benchmark ({} secs, {} chunks, {:.3f} GiB)...",
        benchOptions_.numWriteSecs,
        totalNumChunks_,
        totalChunkGiB_);
  const auto startTime = hf3fs::SteadyClock::now();
  numWriteBytes_ = 0;
  std::vector<folly::SemiFuture<uint32_t>> tasks;
  tasks.reserve(benchOptions_.numCoroutines);
  for (size_t instanceId = 0; instanceId < benchOptions_.numCoroutines; instanceId++) {
    auto task =
        batchWrite(instanceId, benchOptions_.writeBatchSize, benchOptions_.writeSize, benchOptions_.numWriteSecs);
    tasks.push_back(std::move(task).scheduleOn(folly::Executor::getKeepAliveToken(testExecutor_)).start());
  }
  // Propagate the first non-OK status, if any.
  for (const auto status : folly::coro::blockingWait(folly::coro::collectAllRange(std::move(tasks)))) {
    if (status != StatusCode::kOK) {
      XLOGF(WARN, "Test task failed with status code: {}", status);
      return status;
    }
  }
  const auto elapsedTime = hf3fs::SteadyClock::now() - startTime;
  const double totalGiB = (double)numWriteBytes_ / 1_GB;
  printThroughput(elapsedTime, totalGiB);
  const auto mergedDigest = folly::TDigest::merge(writeLatencyDigests_);
  printLatencyDigest(mergedDigest);
  dumpPerfStats("batch write", mergedDigest, elapsedTime, totalGiB, false /*readIO*/);
  return StatusCode::kOK;
}
// Runs the timed read benchmark on every coroutine, then reports the
// aggregated throughput/latency and appends a stats row to the CSV file.
// Returns StatusCode::kOK or the first failing task's status code.
uint32_t runReadBench() {
  XLOGF(WARN, "Running read benchmark ({} secs)...", benchOptions_.numReadSecs);
  auto testStart = hf3fs::SteadyClock::now();
  std::vector<folly::SemiFuture<uint32_t>> readTasks;
  numReadBytes_ = 0;
  for (size_t instanceId = 0; instanceId < benchOptions_.numCoroutines; instanceId++) {
    readTasks.push_back(batchRead(instanceId).scheduleOn(folly::Executor::getKeepAliveToken(testExecutor_)).start());
  }
  auto results = folly::coro::blockingWait(folly::coro::collectAllRange(std::move(readTasks)));
  for (auto res : results) {
    if (res != StatusCode::kOK) {
      XLOGF(WARN, "Test task failed with status code: {}", res);
      return res;
    }
  }
  auto elapsedTime = hf3fs::SteadyClock::now() - testStart;
  double totalGiB = (double)numReadBytes_ / 1_GB;
  printThroughput(elapsedTime, totalGiB);
  auto mergedDigest = folly::TDigest::merge(readLatencyDigests_);
  printLatencyDigest(mergedDigest);
  // Fix: this is the read benchmark, so record the read-side IO size and
  // batch size in the stats CSV (was `false`, which recorded the write-side
  // parameters).
  dumpPerfStats("batch read", mergedDigest, elapsedTime, totalGiB, true /*readIO*/);
  return StatusCode::kOK;
}
// Removes all benchmark chunks. Each coroutine submits remove ops in batches
// of benchOptions_.removeBatchSize; the final partial batch is flushed as
// well. (Fix: previously any trailing ops smaller than a full batch were
// never submitted, leaving chunks behind.)
// Returns StatusCode::kOK or the first failing status code.
uint32_t cleanup() {
  XLOGF(WARN, "Clean up chunks...");
  std::vector<folly::SemiFuture<uint32_t>> removeTasks;
  for (size_t instanceId = 0; instanceId < benchOptions_.numCoroutines; instanceId++) {
    auto batchRemove = [this](size_t instanceId) -> folly::coro::Task<uint32_t> {
      std::vector<client::RemoveChunksOp> removeOps;
      size_t totalNumChunksRemoved = 0;
      const auto &chunkInfos = chunkInfos_[instanceId];
      for (size_t chunkIndex = 0; chunkIndex < chunkInfos.size(); chunkIndex++) {
        const auto &[chainId, chunkId, chunkSize] = chunkInfos[chunkIndex];
        removeOps.push_back(storageClient_->createRemoveOp(chainId, chunkId, ChunkId(chunkId, 1)));
        // Flush on a full batch or on the last chunk so no ops are dropped.
        const bool lastChunk = chunkIndex + 1 == chunkInfos.size();
        if (removeOps.size() >= benchOptions_.removeBatchSize || lastChunk) {
          WriteOptions options;
          options.debug().set_inject_random_server_error(benchOptions_.injectRandomServerError);
          options.debug().set_inject_random_client_error(benchOptions_.injectRandomClientError);
          options.retry().set_retry_permanent_error(benchOptions_.retryPermanentError);
          co_await storageClient_->removeChunks(removeOps, flat::UserInfo(), options);
          for (const auto &removeOp : removeOps) {
            if (removeOp.result.statusCode.hasError()) {
              XLOGF(WARN, "Remove operation failed with error: {}", removeOp.result.statusCode.error());
              co_return removeOp.result.statusCode.error().code();
            }
            XLOGF_IF(DBG5,
                     removeOp.result.numChunksRemoved != 1,
                     "{} chunks removed in range {}",
                     removeOp.result.numChunksRemoved,
                     removeOp.chunkRange());
            totalNumChunksRemoved += removeOp.result.numChunksRemoved;
          }
          removeOps.clear();
        }
      }
      XLOGF(WARN, "{} chunks removed by instance #{}", totalNumChunksRemoved, instanceId);
      co_return StatusCode::kOK;
    };
    removeTasks.push_back(
        batchRemove(instanceId).scheduleOn(folly::Executor::getKeepAliveToken(testExecutor_)).start());
  }
  auto results = folly::coro::blockingWait(folly::coro::collectAllRange(std::move(removeTasks)));
  for (auto res : results) {
    if (res != StatusCode::kOK) {
      XLOGF(WARN, "Test task failed with status code: {}", res);
      return res;
    }
  }
  return StatusCode::kOK;
}
// Truncates every benchmark chunk to the configured chunk size. Each
// coroutine submits truncate ops in batches of benchOptions_.writeBatchSize;
// the final partial batch is flushed as well. (Fix: previously any trailing
// ops smaller than a full batch were never submitted.)
// Returns StatusCode::kOK or the first failing status code.
uint32_t truncate() {
  XLOGF(WARN, "Truncate chunks...");
  std::vector<folly::SemiFuture<uint32_t>> truncateTasks;
  for (size_t instanceId = 0; instanceId < benchOptions_.numCoroutines; instanceId++) {
    auto batchTruncate = [this](size_t instanceId) -> folly::coro::Task<uint32_t> {
      std::vector<client::TruncateChunkOp> truncateOps;
      const auto &chunkInfos = chunkInfos_[instanceId];
      for (size_t chunkIndex = 0; chunkIndex < chunkInfos.size(); chunkIndex++) {
        const auto &[chainId, chunkId, chunkSize] = chunkInfos[chunkIndex];
        truncateOps.push_back(storageClient_->createTruncateOp(chainId, chunkId, 0, setupConfig_.chunk_size()));
        // Flush on a full batch or on the last chunk so no ops are dropped.
        const bool lastChunk = chunkIndex + 1 == chunkInfos.size();
        if (truncateOps.size() >= benchOptions_.writeBatchSize || lastChunk) {
          WriteOptions options;
          options.debug().set_inject_random_server_error(benchOptions_.injectRandomServerError);
          options.debug().set_inject_random_client_error(benchOptions_.injectRandomClientError);
          options.retry().set_retry_permanent_error(benchOptions_.retryPermanentError);
          co_await storageClient_->truncateChunks(truncateOps, flat::UserInfo(), options);
          for (const auto &truncateOp : truncateOps) {
            if (truncateOp.result.lengthInfo.hasError()) {
              XLOGF(WARN, "Truncate operation failed with error: {}", truncateOp.result.lengthInfo.error());
              co_return truncateOp.result.lengthInfo.error().code();
            }
          }
          truncateOps.clear();
        }
      }
      co_return StatusCode::kOK;
    };
    truncateTasks.push_back(
        batchTruncate(instanceId).scheduleOn(folly::Executor::getKeepAliveToken(testExecutor_)).start());
  }
  auto results = folly::coro::blockingWait(folly::coro::collectAllRange(std::move(truncateTasks)));
  for (auto res : results) {
    if (res != StatusCode::kOK) {
      XLOGF(WARN, "Test task failed with status code: {}", res);
      return res;
    }
  }
  return StatusCode::kOK;
}
// Executes the enabled benchmark phases in order: timed writes, chunk
// generation, timed reads. Returns false as soon as any phase fails.
bool run() {
  if (benchOptions_.numWriteSecs > 0 && runWriteBench() != StatusCode::kOK) return false;
  if (benchOptions_.generateTestData && generateChunks() != StatusCode::kOK) return false;
  if (benchOptions_.numReadSecs > 0 && runReadBench() != StatusCode::kOK) return false;
  return true;
}
uint64_t getWriteBytes() { return numWriteBytes_; }
uint64_t getReadBytes() { return numReadBytes_; }
};
} // namespace hf3fs::storage::benchmark

33
cmake/AddCrate.cmake Normal file
View File

@@ -0,0 +1,33 @@
# Select the cargo profile matching the CMake build type so Rust artifacts
# land in the expected target/<profile> directory.
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
  set(CARGO_CMD cargo build)
  set(TARGET_DIR "debug")
else ()
  set(CARGO_CMD cargo build --release)
  set(TARGET_DIR "release")
endif ()
# Build the whole cargo workspace once per CMake build.
add_custom_target(
  cargo_build_all ALL
  COMMAND ${CARGO_CMD}
  WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
)
# add_crate(NAME): wraps Rust crate NAME (built via cargo/cxxbridge) as a
# CMake static library target named NAME that C++ targets can link against.
# NOTE(review): being a macro, LIBRARY/SOURCES leak into the caller's scope
# and are overwritten by consecutive add_crate() calls -- confirm no caller
# reads them afterwards before converting to a function.
macro(add_crate NAME)
  set(LIBRARY "${PROJECT_SOURCE_DIR}/target/${TARGET_DIR}/lib${NAME}.a")
  # Generated cxx bridge sources produced by the cargo build.
  set(SOURCES
    "${PROJECT_SOURCE_DIR}/target/cxxbridge/${NAME}/src/cxx.rs.h"
    "${PROJECT_SOURCE_DIR}/target/cxxbridge/${NAME}/src/cxx.rs.cc"
  )
  add_custom_command(
    OUTPUT ${SOURCES} ${LIBRARY}
    COMMAND ${CARGO_CMD}
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/${NAME}"
  )
  add_library(${NAME} STATIC ${SOURCES} ${LIBRARY})
  target_link_libraries(${NAME} pthread dl ${LIBRARY})
  target_include_directories(${NAME} PUBLIC "${PROJECT_SOURCE_DIR}/target/cxxbridge")
  target_compile_options(${NAME} PUBLIC -Wno-dollar-in-identifier-extension)
  add_dependencies(${NAME} cargo_build_all)
endmacro()

54
cmake/ApacheArrow.cmake Normal file
View File

@@ -0,0 +1,54 @@
# Builds Apache Arrow (arrow + parquet, static, with bundled dependencies)
# via ExternalProject and exposes it as the single INTERFACE target
# `apache_arrow_static` for consumers to link against.
add_library(apache_arrow_static INTERFACE)
add_library(arrow_static STATIC IMPORTED)
add_library(parquet_static STATIC IMPORTED)
add_library(arrow_dependencies STATIC IMPORTED)
set(PREFIX "${CMAKE_CURRENT_BINARY_DIR}")
set(ARROW_RELEASE_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/src/apache-arrow-cpp/cpp/build/release")
# https://cmake.org/cmake/help/latest/policy/CMP0097.html
# Starting with CMake 3.16, explicitly setting GIT_SUBMODULES to an empty string
# means no submodules will be initialized or updated.
cmake_policy(SET CMP0097 NEW)
include(ExternalProject)
ExternalProject_Add(
  apache-arrow-cpp
  PREFIX ${PREFIX}
  GIT_REPOSITORY https://github.com/apache/arrow.git
  GIT_TAG b7d2f7ffca66c868bd2fce5b3749c6caa002a7f0
  GIT_SHALLOW ON
  GIT_PROGRESS ON
  GIT_SUBMODULES ""
  SOURCE_SUBDIR "cpp"
  BUILD_IN_SOURCE ON
  INSTALL_DIR ${PREFIX}
  # The configure step caches the dependency download script output in
  # export.sh so re-configures do not re-download.
  CONFIGURE_COMMAND bash -x -c "\
( cd thirdparty && [[ -f export.sh ]] || ./download_dependencies.sh | tee export.sh ) && \
source thirdparty/export.sh && cmake -S . -B . \
-DCMAKE_BUILD_TYPE=Release \
-DARROW_USE_CCACHE=OFF \
-DARROW_USE_SCCACHE=OFF \
-DARROW_DEPENDENCY_SOURCE=BUNDLED \
-DARROW_BUILD_STATIC=ON \
-DARROW_JEMALLOC=ON \
-DARROW_SIMD_LEVEL=DEFAULT \
-DARROW_BUILD_EXAMPLES=OFF \
-DARROW_PARQUET=ON -DARROW_CSV=ON \
-DARROW_WITH_ZSTD=ON -DARROW_WITH_LZ4=ON -DARROW_WITH_ZLIB=ON"
  BUILD_COMMAND bash -x -c "source thirdparty/export.sh && cmake --build . -j"
  INSTALL_COMMAND cmake --install . --prefix "${PREFIX}"
  # Declare the produced archives so Ninja can track them as byproducts.
  BUILD_BYPRODUCTS
    "${ARROW_RELEASE_BUILD_DIR}/libarrow.a"
    "${ARROW_RELEASE_BUILD_DIR}/libparquet.a"
    "${ARROW_RELEASE_BUILD_DIR}/libarrow_bundled_dependencies.a"
)
add_dependencies(arrow_static apache-arrow-cpp)
add_dependencies(parquet_static apache-arrow-cpp)
add_dependencies(arrow_dependencies apache-arrow-cpp)
set_target_properties(arrow_static PROPERTIES IMPORTED_LOCATION "${ARROW_RELEASE_BUILD_DIR}/libarrow.a")
set_target_properties(parquet_static PROPERTIES IMPORTED_LOCATION "${ARROW_RELEASE_BUILD_DIR}/libparquet.a")
set_target_properties(arrow_dependencies PROPERTIES IMPORTED_LOCATION "${ARROW_RELEASE_BUILD_DIR}/libarrow_bundled_dependencies.a")
target_include_directories(apache_arrow_static SYSTEM INTERFACE "${PREFIX}/include")
target_link_libraries(apache_arrow_static INTERFACE parquet_static arrow_static arrow_dependencies)

24
cmake/CLangFormat.cmake Normal file
View File

@@ -0,0 +1,24 @@
# Hard-coded path: the build environment is expected to ship clang-format-14.
set(CLANG_FORMAT "/usr/bin/clang-format-14")
if(EXISTS ${CLANG_FORMAT})
message(STATUS "Found clang-format at ${CLANG_FORMAT}")
set(SOURCE_DIRS
${CMAKE_SOURCE_DIR}/src
${CMAKE_SOURCE_DIR}/tests
${CMAKE_SOURCE_DIR}/benchmarks
)
# For now, it just hard codes the source files list to globs. That works
# fine until we have another directory in `src/`. We should ideally gather
# this from SOURCE_FILES list. But, should filter the third_party sources.
# Taking a quick route for now. We should deal with it sometime down the line.
# `format` rewrites files in place; `check-format` only reports violations
# (--dry-run --Werror) so it can gate CI. "_generated.h" headers are excluded.
add_custom_target(format
COMMENT "Running clang-format"
COMMAND find ${SOURCE_DIRS} -name '*.cc' -o -name '*.cpp' -o -name '*.h' | grep -v "_generated.h" | xargs ${CLANG_FORMAT} -i)
add_custom_target(check-format
COMMENT "Running clang-format"
COMMAND find ${SOURCE_DIRS} -name '*.cc' -o -name '*.cpp' -o -name '*.h' | grep -v "_generated.h" | xargs ${CLANG_FORMAT} --Werror --dry-run)
else()
message(FATAL_ERROR "clang-format-14 not found")
endif()

52
cmake/CLangTidy.cmake Normal file
View File

@@ -0,0 +1,52 @@
# clang-tidy generates too many warnings, so just disable it by default.
option(ENABLE_CLANG_TIDY "Run clang-tidy during build" OFF)
find_program(CLANG_TIDY NAMES clang-tidy-14)
if(CLANG_TIDY)
  # BUG FIX: the original tested `CMake_SOURCE_DIR STREQUAL CMake_BINARY_DIR`.
  # CMake variable names are case-sensitive, so both were undefined (empty ==
  # empty was avoided only because STREQUAL compared the literal names) and
  # the in-source-build guard could never fire for an actual in-source build.
  if(CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR)
    message(FATAL_ERROR "CMake_RUN_CLANG_TIDY requires an out-of-source build!")
  endif()
  # clang-tidy locates translation units through compile_commands.json.
  if(NOT CMAKE_EXPORT_COMPILE_COMMANDS)
    message(WARNING "CMAKE_EXPORT_COMPILE_COMMANDS=OFF, clang-tidy may not works!!!")
  endif()
  # Restrict diagnostics to first-party headers; the escaped \(..\|..\) form is
  # the regex alternation passed through to clang-tidy's -header-filter.
  set(HEADER_FILTER "${CMAKE_SOURCE_DIR}/\\(src\\|tests\\|benchmarks\\|demos\\)")
  if(ENABLE_CLANG_TIDY)
    set(CMAKE_CXX_CLANG_TIDY ${CLANG_TIDY} --header-filter ${HEADER_FILTER})
    # Create a preprocessor definition that depends on .clang-tidy content so
    # the compile command will change when .clang-tidy changes. This ensures
    # that a subsequent build re-runs clang-tidy on all sources even if they
    # do not otherwise need to be recompiled. Nothing actually uses this
    # definition. We add it to targets on which we run clang-tidy just to
    # get the build dependency on the .clang-tidy file.
    file(SHA1 ${CMAKE_CURRENT_SOURCE_DIR}/.clang-tidy clang_tidy_sha1)
    set(CLANG_TIDY_DEFINITIONS "CLANG_TIDY_SHA1=${clang_tidy_sha1}")
    unset(clang_tidy_sha1)
    configure_file(.clang-tidy .clang-tidy COPYONLY)
  endif()
  set(SOURCE_DIRS
    ${CMAKE_SOURCE_DIR}/src
    ${CMAKE_SOURCE_DIR}/tests
    ${CMAKE_SOURCE_DIR}/demos
    ${CMAKE_SOURCE_DIR}/benchmarks
  )
  # For now, it just hard codes the source files list to globs. That works
  # fine until we have another directory in `src/`. We should ideally gather
  # this from SOURCE_FILES list. But, should filter the third_party sources.
  # Taking a quick route for now. We should deal with it sometime down the line.
  # `clang-tidy` reports issues; `clang-tidy-fix` applies suggested fixes.
  # NOTE(review): because of find's operator precedence, `-not -name
  # "*.actor.cpp"` binds only to the `*.cpp` test — confirm that is intended.
  add_custom_target(clang-tidy
    COMMENT "Running clang-tidy"
    COMMAND run-clang-tidy-14 -header-filter ${HEADER_FILTER} `find ${SOURCE_DIRS} -name "*.cc" -o -name "*.cpp" -not -name "*.actor.cpp" ` -quiet)
  add_custom_target(clang-tidy-fix
    COMMENT "Running clang-tidy -fix"
    COMMAND run-clang-tidy-14 -header-filter ${HEADER_FILTER} `find ${SOURCE_DIRS} -name "*.cc" -o -name "*.cpp" -not -name "*.actor.cpp" ` -fix -quiet)
else()
  message(WARNING "clang-tidy-14 not found!!!")
endif()

12
cmake/CTestCustom.cmake Normal file
View File

@@ -0,0 +1,12 @@
# Post-process the generated CTestTestfile.cmake (this template is configured
# with @CMAKE_BINARY_DIR@ substituted at configure time) so that tests
# registered by third_party subprojects are not picked up by ctest.
set(_ctest_file "@CMAKE_BINARY_DIR@/CTestTestfile.cmake")
file(STRINGS "${_ctest_file}" _original_lines)
# Truncate the file, then re-emit every line, blanking the ones mentioning
# third_party (the blank line is kept so line structure is preserved).
file(WRITE "${_ctest_file}" "")
foreach(_line IN LISTS _original_lines)
  string(REGEX REPLACE ".*third_party.*" "" _kept_line "${_line}")
  file(APPEND "${_ctest_file}" "${_kept_line}\n")
endforeach()

27
cmake/CodeCoverage.cmake Normal file
View File

@@ -0,0 +1,27 @@
# Optional coverage instrumentation: gcov-style for GCC, source-based
# (llvm-cov) for Clang. Flags are appended globally to C and C++ builds.
option(ENABLE_CODE_COVERAGE "Enable code coverage" OFF)
if(ENABLE_CODE_COVERAGE)
  if(CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG" )
    message(STATUS "Enable code coverage with debug mode.")
  else()
    # Coverage of optimized builds still works but line data may be skewed.
    message(WARNING "Code coverage with no debug mode!!!")
  endif()
  # Pick the instrumentation flavor matching the compiler.
  if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
    set(COVERAGE_COMPILER_FLAGS "-g -fprofile-arcs -ftest-coverage")
    # gcov runtime is needed at link time for GCC's instrumentation.
    link_libraries(gcov)
  elseif(CMAKE_CXX_COMPILER_ID MATCHES "[Cc]lang")
    set(COVERAGE_COMPILER_FLAGS "-g -fprofile-instr-generate -fcoverage-mapping")
  else()
    # BUG FIX: message previously read "Unknown compiler iid" (typo).
    message(FATAL_ERROR "Unknown compiler id ${CMAKE_CXX_COMPILER_ID}")
  endif()
  include(CheckCXXCompilerFlag)
  # -fprofile-abs-path makes .gcno files use absolute paths, keeping reports
  # stable regardless of the compiler working directory (GCC >= 8 only).
  check_cxx_compiler_flag(-fprofile-abs-path HAVE_fprofile_abs_path)
  if(HAVE_fprofile_abs_path)
    set(COVERAGE_COMPILER_FLAGS "${COVERAGE_COMPILER_FLAGS} -fprofile-abs-path")
  endif()
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COVERAGE_COMPILER_FLAGS}")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COVERAGE_COMPILER_FLAGS}")
  message(STATUS "Appending code coverage compiler flags: ${COVERAGE_COMPILER_FLAGS}")
endif()

21
cmake/CompileFlags.cmake Normal file
View File

@@ -0,0 +1,21 @@
# Snapshot every per-language / per-configuration compiler-flag variable into
# ORIGINAL_* so restore_compile_flags() can undo temporary modifications.
macro(store_compile_flags)
  foreach(_flag_var
      C_FLAGS C_FLAGS_DEBUG C_FLAGS_RELEASE C_FLAGS_RELWITHDEBINFO
      CXX_FLAGS CXX_FLAGS_DEBUG CXX_FLAGS_RELEASE CXX_FLAGS_RELWITHDEBINFO)
    set(ORIGINAL_${_flag_var} "${CMAKE_${_flag_var}}")
  endforeach()
endmacro()
# Restore the CMAKE_{C,CXX}_FLAGS* variables from the ORIGINAL_* snapshots
# taken by store_compile_flags().
macro(restore_compile_flags)
  foreach(_flag_var
      C_FLAGS C_FLAGS_DEBUG C_FLAGS_RELEASE C_FLAGS_RELWITHDEBINFO
      CXX_FLAGS CXX_FLAGS_DEBUG CXX_FLAGS_RELEASE CXX_FLAGS_RELWITHDEBINFO)
    set(CMAKE_${_flag_var} "${ORIGINAL_${_flag_var}}")
  endforeach()
endmacro()

15
cmake/DumpConfig.cmake Normal file
View File

@@ -0,0 +1,15 @@
# `dump-config` regenerates the default TOML configs checked in under
# configs/ (in the SOURCE tree) by running each freshly-built binary with its
# --dump_default_* flag. The FUSE flavor additionally covers hf3fs_fuse_main.
# NOTE(review): admin_cli appears only in the plain-cfg dump — presumably it
# has no app/launcher config; confirm.
if (ENABLE_FUSE_APPLICATION)
add_custom_target(dump-config
COMMENT "Running dump config"
COMMAND find ${CMAKE_CURRENT_BINARY_DIR} -name 'meta_main' -o -name 'mgmtd_main' -o -name 'storage_main' -o -name 'admin_cli' -o -name 'hf3fs_fuse_main' | xargs -I {} bash -c '{} --dump_default_cfg > ${CMAKE_SOURCE_DIR}/configs/`basename {}`.toml'
COMMAND find ${CMAKE_CURRENT_BINARY_DIR} -name 'meta_main' -o -name 'mgmtd_main' -o -name 'storage_main' -o -name 'hf3fs_fuse_main' | xargs -I {} bash -c '{} --dump_default_app_cfg > ${CMAKE_SOURCE_DIR}/configs/`basename {}`_app.toml'
COMMAND find ${CMAKE_CURRENT_BINARY_DIR} -name 'meta_main' -o -name 'mgmtd_main' -o -name 'storage_main' -o -name 'hf3fs_fuse_main' | xargs -I {} bash -c '{} --dump_default_launcher_cfg > ${CMAKE_SOURCE_DIR}/configs/`basename {}`_launcher.toml')
add_dependencies(dump-config meta_main mgmtd_main storage_main admin_cli hf3fs_fuse_main)
else()
# Same as above, minus the FUSE binary.
add_custom_target(dump-config
COMMENT "Running dump config"
COMMAND find ${CMAKE_CURRENT_BINARY_DIR} -name 'meta_main' -o -name 'mgmtd_main' -o -name 'storage_main' -o -name 'admin_cli' | xargs -I {} bash -c '{} --dump_default_cfg > ${CMAKE_SOURCE_DIR}/configs/`basename {}`.toml'
COMMAND find ${CMAKE_CURRENT_BINARY_DIR} -name 'meta_main' -o -name 'mgmtd_main' -o -name 'storage_main' | xargs -I {} bash -c '{} --dump_default_app_cfg > ${CMAKE_SOURCE_DIR}/configs/`basename {}`_app.toml'
COMMAND find ${CMAKE_CURRENT_BINARY_DIR} -name 'meta_main' -o -name 'mgmtd_main' -o -name 'storage_main' | xargs -I {} bash -c '{} --dump_default_launcher_cfg > ${CMAKE_SOURCE_DIR}/configs/`basename {}`_launcher.toml')
add_dependencies(dump-config meta_main mgmtd_main storage_main admin_cli)
endif()

127
cmake/GitVersion.cmake Normal file
View File

@@ -0,0 +1,127 @@
find_package(Git REQUIRED)
# Where the VersionInfo.cc.in template lives and where the configured
# VersionInfo.cc is written. Both directories are overridable by the includer
# and by the script-mode (-P) re-invocation from CheckGitSetup.
if (NOT DEFINED PRE_CONFIGURE_DIR)
set(PRE_CONFIGURE_DIR ${PROJECT_SOURCE_DIR}/src/common/utils)
endif ()
if (NOT DEFINED POST_BUILD_DIR)
set(POST_BUILD_DIR ${PROJECT_BINARY_DIR})
endif ()
set(PRE_CONFIGURE_FILE ${PRE_CONFIGURE_DIR}/VersionInfo.cc.in)
set(POST_CONFIGURE_FILE ${POST_BUILD_DIR}/src/common/utils/VersionInfo.cc)
# Persist the commit hash the generated VersionInfo.cc was produced from, so
# later builds can skip regeneration when HEAD has not moved.
function(CheckGitWrite git_hash)
file(WRITE ${POST_BUILD_DIR}/git-state.txt ${git_hash})
endfunction()
# Read the cached commit hash into the caller's `git_hash` variable; leaves it
# untouched (typically undefined) when no cache file exists yet.
function(CheckGitRead git_hash)
if (EXISTS ${POST_BUILD_DIR}/git-state.txt)
file(STRINGS ${POST_BUILD_DIR}/git-state.txt CONTENT)
LIST(GET CONTENT 0 var)
set(${git_hash} ${var} PARENT_SCOPE)
endif ()
endfunction()
# Gather git metadata (short/full hash, commit timestamp/date, latest tag and
# commits-since-tag) plus project version numbers, then regenerate
# VersionInfo.cc from its template iff the commit hash changed.
function(CheckGitVersion)
# Get the latest abbreviated commit hash of the working branch
execute_process(
COMMAND ${GIT_EXECUTABLE} rev-parse --short=8 HEAD
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE BUILD_COMMIT_HASH_SHORT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
execute_process(
COMMAND ${GIT_EXECUTABLE} rev-parse HEAD
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE BUILD_COMMIT_HASH_FULL
OUTPUT_STRIP_TRAILING_WHITESPACE
)
# Commit time of HEAD as a unix epoch (%at).
execute_process(
COMMAND ${GIT_EXECUTABLE} log -1 --format=%at --date=local
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE BUILD_TIMESTAMP
OUTPUT_STRIP_TRAILING_WHITESPACE
)
# Render the epoch as YYYYMMDD (uses GNU `date -d @epoch`).
execute_process(
COMMAND date -d @${BUILD_TIMESTAMP} +%Y%m%d
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE BUILD_DATE
OUTPUT_STRIP_TRAILING_WHITESPACE
)
execute_process(
COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE BUILD_TAG
OUTPUT_STRIP_TRAILING_WHITESPACE
)
# Number of commits since that tag, extracted from `git describe --long`.
execute_process(
COMMAND bash -c "${GIT_EXECUTABLE} describe --tags --long | sed -E 's/(.*)-([0-9]+)-(\\w+)/\\2/g'"
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE BUILD_TAG_SEQ_NUM
OUTPUT_STRIP_TRAILING_WHITESPACE
)
# Fallbacks for repos without tags (e.g. shallow clones).
if(NOT DEFINED BUILD_TAG OR BUILD_TAG STREQUAL "")
set(BUILD_TAG "250228")
endif()
if(NOT DEFINED BUILD_TAG_SEQ_NUM OR BUILD_TAG_SEQ_NUM STREQUAL "")
set(BUILD_TAG_SEQ_NUM "1")
endif()
message(STATUS "Git Commit hash: ${BUILD_COMMIT_HASH_SHORT} ${BUILD_COMMIT_HASH_FULL}")
message(STATUS "Git Commit Date & Timestamp: ${BUILD_DATE} ${BUILD_TIMESTAMP}")
message(STATUS "Git Commit Tag & Seq Num: ${BUILD_TAG} ${BUILD_TAG_SEQ_NUM}")
set(BUILD_VERSION_MAJOR "${PROJECT_VERSION_MAJOR}")
set(BUILD_VERSION_MINOR "${PROJECT_VERSION_MINOR}")
set(BUILD_VERSION_PATCH "${PROJECT_VERSION_PATCH}")
set(BUILD_VERSION "${PROJECT_VERSION}")
CheckGitRead(GIT_HASH_CACHE)
if (NOT DEFINED GIT_HASH_CACHE)
set(GIT_HASH_CACHE "INVALID")
endif ()
if (NOT DEFINED BUILD_ON_RELEASE_BRANCH)
set(BUILD_ON_RELEASE_BRANCH "false")
endif ()
if (NOT DEFINED BUILD_PIPELINE_ID)
set(BUILD_PIPELINE_ID "999999")
endif()
# Only update the generated VersionInfo.cc if the hash has changed. This will
# prevent us from rebuilding the project more than we need to.
if (NOT ${BUILD_COMMIT_HASH_FULL} STREQUAL ${GIT_HASH_CACHE} OR NOT EXISTS ${POST_CONFIGURE_FILE})
# Set the GIT_HASH_CACHE so the next build won't have
# to regenerate the source file.
CheckGitWrite("${BUILD_COMMIT_HASH_FULL}")
configure_file(${PRE_CONFIGURE_FILE} ${POST_CONFIGURE_FILE} @ONLY)
endif ()
endfunction()
# Create the always-run AlwaysCheckGit target, which re-executes this file in
# script mode (-P) on every build so VersionInfo.cc tracks HEAD, and the
# `version-info` static library that other targets link for build metadata.
function(CheckGitSetup project_src_dir)
add_custom_target(AlwaysCheckGit COMMAND ${CMAKE_COMMAND}
-DRUN_CHECK_GIT_VERSION=1
-DPRE_CONFIGURE_DIR=${PRE_CONFIGURE_DIR}
-DPOST_BUILD_DIR=${POST_BUILD_DIR}
-DGIT_HASH_CACHE=${GIT_HASH_CACHE}
-DPROJECT_VERSION_MAJOR=${PROJECT_VERSION_MAJOR}
-DPROJECT_VERSION_MINOR=${PROJECT_VERSION_MINOR}
-DPROJECT_VERSION_PATCH=${PROJECT_VERSION_PATCH}
-DPROJECT_VERSION=${PROJECT_VERSION}
-DPROJECT_SOURCE_DIR=${project_src_dir}
-P ${PROJECT_SOURCE_DIR}/cmake/GitVersion.cmake
DEPENDS ${PRE_CONFIGURE_FILE}
BYPRODUCTS ${POST_CONFIGURE_FILE}
)
add_library(version-info STATIC ${POST_CONFIGURE_FILE})
target_include_directories(version-info PUBLIC ${PROJECT_SOURCE_DIR}/src)
add_dependencies(version-info AlwaysCheckGit)
# Also run once at configure time so the generated file exists before the
# first build.
CheckGitVersion()
endfunction()
# Script-mode entry point: AlwaysCheckGit invokes this file with `cmake -P`
# and -DRUN_CHECK_GIT_VERSION=1 on every build.
if (RUN_CHECK_GIT_VERSION)
CheckGitVersion()
endif ()

20
cmake/Jemalloc.cmake Normal file
View File

@@ -0,0 +1,20 @@
# `jemalloc` is the INTERFACE target consumers link; it wraps a shared
# jemalloc built from the vendored sources in third_party/jemalloc.
add_library(jemalloc INTERFACE)
add_library(hf3fs_jemalloc_shared SHARED IMPORTED)
include(ExternalProject)
set(JEMALLOC_DIR "${CMAKE_BINARY_DIR}/third_party/jemalloc")
# Configure flags: C++ API disabled, heap profiling enabled.
# NOTE(review): --disable-initial-exec-tls is presumably for dlopen()
# compatibility — confirm against how the library is loaded.
ExternalProject_add(Hf3fsJemalloc_project
SOURCE_DIR "${PROJECT_SOURCE_DIR}/third_party/jemalloc"
BUILD_BYPRODUCTS "${JEMALLOC_DIR}/include/jemalloc/jemalloc.h"
"${JEMALLOC_DIR}/lib/libjemalloc.so.2"
CONFIGURE_COMMAND ./autogen.sh && ./configure --prefix=${JEMALLOC_DIR} --disable-cxx --enable-prof --disable-initial-exec-tls
BUILD_IN_SOURCE ON
BUILD_COMMAND make -j 6
INSTALL_DIR "${JEMALLOC_DIR}"
INSTALL_COMMAND make install)
add_dependencies(hf3fs_jemalloc_shared Hf3fsJemalloc_project)
set_target_properties(hf3fs_jemalloc_shared PROPERTIES IMPORTED_LOCATION "${JEMALLOC_DIR}/lib/libjemalloc.so.2")
target_include_directories(hf3fs_jemalloc_shared INTERFACE "${JEMALLOC_DIR}/include")
target_link_libraries(jemalloc INTERFACE hf3fs_jemalloc_shared)

242
cmake/Sanitizers.cmake Normal file
View File

@@ -0,0 +1,242 @@
#
# Copyright (C) 2018-2022 by George Cave - gcave@stablecoder.ca
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
include(CheckCXXSourceCompiles)
# Map the project-facing SANITIZER cache option onto the USE_SANITIZER /
# USE_* variables consumed by the vendored selection logic below.
set(SANITIZER "" CACHE STRING "Sanitizer: ASAN, MSAN, UBSAN, TSAN")
if(SANITIZER STREQUAL "ASAN")
message(STATUS "Build with Address Sanitizer.")
set(USE_SANITIZER "Address")
set(USE_ASAN ON)
elseif(SANITIZER STREQUAL "MSAN")
message(STATUS "Build with Memory Sanitizer.")
set(USE_SANITIZER "Memory")
set(USE_MSAN ON)
elseif(SANITIZER STREQUAL "UBSAN")
message(STATUS "Build with Undefined behavior Sanitizer.")
message(WARNING "With clang++ v14.0.6 undefined behavior sanitizer doesn't work with coroutine!!!")
set(USE_SANITIZER "Undefined")
set(USE_UBSAN ON)
elseif(SANITIZER STREQUAL "TSAN")
message(STATUS "Build with Thread Sanitizer.")
set(USE_SANITIZER "Thread")
set(USE_TSAN ON)
else()
# Any other value (including empty) disables sanitizers entirely.
message(STATUS "Sanitizer not enabled. ${SANITIZER}")
set(USE_SANITIZER "")
endif()
# set(USE_SANITIZER "" CACHE STRING
# "Compile with a sanitizer. Options are: Address, Memory, MemoryWithOrigins, Undefined, Thread, Leak, 'Address;Undefined', CFI"
# )
# Space-join `value` onto every variable named in ARGN, exporting each result
# to the caller's scope.
function(append value)
  foreach(out_var IN LISTS ARGN)
    set(${out_var} "${${out_var}} ${value}" PARENT_SCOPE)
  endforeach()
endfunction()
# Like append(), but without quoting: `value` becomes an additional list
# element of each named variable instead of being folded into one
# space-separated string.
function(append_quoteless value)
  foreach(out_var IN LISTS ARGN)
    set(${out_var} ${${out_var}} ${value} PARENT_SCOPE)
  endforeach()
endfunction()
# Probe whether a trivial program compiles with `flags`; the result lands in
# the cache variable named by `return_var`. CMAKE_REQUIRED_* state is saved
# and restored around the probe.
function(test_san_flags return_var flags)
set(QUIET_BACKUP ${CMAKE_REQUIRED_QUIET})
set(CMAKE_REQUIRED_QUIET TRUE)
# Drop any cached result so the probe reruns with the current flags.
unset(${return_var} CACHE)
set(FLAGS_BACKUP ${CMAKE_REQUIRED_FLAGS})
set(CMAKE_REQUIRED_FLAGS "${flags}")
check_cxx_source_compiles("int main() { return 0; }" ${return_var})
set(CMAKE_REQUIRED_FLAGS "${FLAGS_BACKUP}")
set(CMAKE_REQUIRED_QUIET "${QUIET_BACKUP}")
endfunction()
# Probe each requested sanitizer flag with the active compiler, accumulate the
# working ones in SANITIZER_SELECTED_FLAGS, and finally append them to the
# global C/C++ flags.
if(USE_SANITIZER)
  # Keep frame pointers so sanitizer stack traces stay readable.
  append("-fno-omit-frame-pointer" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
  unset(SANITIZER_SELECTED_FLAGS)
  if(UNIX)
    # CONSISTENCY FIX: this project stores the upper-cased build type in
    # CMAKE_BUILD_TYPE_UC (see CodeCoverage.cmake / Target.cmake). The
    # original checked the undefined uppercase_CMAKE_BUILD_TYPE, so -O1 was
    # never appended for debug builds.
    if(CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG")
      append("-O1" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
    endif()
    if(USE_SANITIZER MATCHES "([Aa]ddress)")
      # Optional: -fno-optimize-sibling-calls -fsanitize-address-use-after-scope
      message(STATUS "Testing with Address sanitizer")
      set(SANITIZER_ADDR_FLAG "-fsanitize=address")
      test_san_flags(SANITIZER_ADDR_AVAILABLE ${SANITIZER_ADDR_FLAG})
      if(SANITIZER_ADDR_AVAILABLE)
        message(STATUS " Building with Address sanitizer")
        append("${SANITIZER_ADDR_FLAG}" SANITIZER_SELECTED_FLAGS)
        if(AFL)
          append_quoteless(AFL_USE_ASAN=1 CMAKE_C_COMPILER_LAUNCHER
                           CMAKE_CXX_COMPILER_LAUNCHER)
        endif()
      else()
        message(
          FATAL_ERROR
          "Address sanitizer not available for ${CMAKE_CXX_COMPILER}")
      endif()
    endif()
    if(USE_SANITIZER MATCHES "([Mm]emory([Ww]ith[Oo]rigins)?)")
      # Optional: -fno-optimize-sibling-calls -fsanitize-memory-track-origins=2
      set(SANITIZER_MEM_FLAG "-fsanitize=memory")
      if(USE_SANITIZER MATCHES "([Mm]emory[Ww]ith[Oo]rigins)")
        message(STATUS "Testing with MemoryWithOrigins sanitizer")
        append("-fsanitize-memory-track-origins" SANITIZER_MEM_FLAG)
      else()
        message(STATUS "Testing with Memory sanitizer")
      endif()
      test_san_flags(SANITIZER_MEM_AVAILABLE ${SANITIZER_MEM_FLAG})
      if(SANITIZER_MEM_AVAILABLE)
        if(USE_SANITIZER MATCHES "([Mm]emory[Ww]ith[Oo]rigins)")
          message(STATUS " Building with MemoryWithOrigins sanitizer")
        else()
          message(STATUS " Building with Memory sanitizer")
        endif()
        append("${SANITIZER_MEM_FLAG}" SANITIZER_SELECTED_FLAGS)
        if(AFL)
          append_quoteless(AFL_USE_MSAN=1 CMAKE_C_COMPILER_LAUNCHER
                           CMAKE_CXX_COMPILER_LAUNCHER)
        endif()
      else()
        message(
          FATAL_ERROR
          "Memory [With Origins] sanitizer not available for ${CMAKE_CXX_COMPILER}"
        )
      endif()
    endif()
    if(USE_SANITIZER MATCHES "([Uu]ndefined)")
      message(STATUS "Testing with Undefined Behaviour sanitizer")
      set(SANITIZER_UB_FLAG "-fsanitize=undefined")
      # Honor a suppression list when the project provides one.
      if(EXISTS "${BLACKLIST_FILE}")
        append("-fsanitize-blacklist=${BLACKLIST_FILE}" SANITIZER_UB_FLAG)
      endif()
      test_san_flags(SANITIZER_UB_AVAILABLE ${SANITIZER_UB_FLAG})
      if(SANITIZER_UB_AVAILABLE)
        message(STATUS " Building with Undefined Behaviour sanitizer")
        append("${SANITIZER_UB_FLAG}" SANITIZER_SELECTED_FLAGS)
        if(AFL)
          append_quoteless(AFL_USE_UBSAN=1 CMAKE_C_COMPILER_LAUNCHER
                           CMAKE_CXX_COMPILER_LAUNCHER)
        endif()
      else()
        message(
          FATAL_ERROR
          "Undefined Behaviour sanitizer not available for ${CMAKE_CXX_COMPILER}"
        )
      endif()
    endif()
    if(USE_SANITIZER MATCHES "([Tt]hread)")
      message(STATUS "Testing with Thread sanitizer")
      # TSan suppressions are read from tsan_ignore.txt at the source root.
      set(SANITIZER_THREAD_FLAG "-fsanitize=thread -fsanitize-ignorelist=${CMAKE_SOURCE_DIR}/tsan_ignore.txt")
      test_san_flags(SANITIZER_THREAD_AVAILABLE ${SANITIZER_THREAD_FLAG})
      if(SANITIZER_THREAD_AVAILABLE)
        message(STATUS " Building with Thread sanitizer")
        append("${SANITIZER_THREAD_FLAG}" SANITIZER_SELECTED_FLAGS)
        if(AFL)
          append_quoteless(AFL_USE_TSAN=1 CMAKE_C_COMPILER_LAUNCHER
                           CMAKE_CXX_COMPILER_LAUNCHER)
        endif()
      else()
        message(
          FATAL_ERROR "Thread sanitizer not available for ${CMAKE_CXX_COMPILER}"
        )
      endif()
    endif()
    if(USE_SANITIZER MATCHES "([Ll]eak)")
      message(STATUS "Testing with Leak sanitizer")
      set(SANITIZER_LEAK_FLAG "-fsanitize=leak")
      test_san_flags(SANITIZER_LEAK_AVAILABLE ${SANITIZER_LEAK_FLAG})
      if(SANITIZER_LEAK_AVAILABLE)
        message(STATUS " Building with Leak sanitizer")
        append("${SANITIZER_LEAK_FLAG}" SANITIZER_SELECTED_FLAGS)
        if(AFL)
          append_quoteless(AFL_USE_LSAN=1 CMAKE_C_COMPILER_LAUNCHER
                           CMAKE_CXX_COMPILER_LAUNCHER)
        endif()
      else()
        # BUG FIX: this error previously said "Thread sanitizer".
        message(
          FATAL_ERROR "Leak sanitizer not available for ${CMAKE_CXX_COMPILER}"
        )
      endif()
    endif()
    if(USE_SANITIZER MATCHES "([Cc][Ff][Ii])")
      message(STATUS "Testing with Control Flow Integrity(CFI) sanitizer")
      set(SANITIZER_CFI_FLAG "-fsanitize=cfi")
      test_san_flags(SANITIZER_CFI_AVAILABLE ${SANITIZER_CFI_FLAG})
      if(SANITIZER_CFI_AVAILABLE)
        message(STATUS " Building with Control Flow Integrity(CFI) sanitizer")
        # BUG FIX: the original appended SANITIZER_LEAK_FLAG here, so a CFI
        # build silently got the leak flag (or nothing) instead of CFI.
        append("${SANITIZER_CFI_FLAG}" SANITIZER_SELECTED_FLAGS)
        if(AFL)
          append_quoteless(AFL_USE_CFISAN=1 CMAKE_C_COMPILER_LAUNCHER
                           CMAKE_CXX_COMPILER_LAUNCHER)
        endif()
      else()
        message(
          FATAL_ERROR
          "Control Flow Integrity(CFI) sanitizer not available for ${CMAKE_CXX_COMPILER}"
        )
      endif()
    endif()
    # Final probe with every selected flag combined, since individually-valid
    # sanitizers are not always combinable.
    message(STATUS "Sanitizer flags: ${SANITIZER_SELECTED_FLAGS}")
    test_san_flags(SANITIZER_SELECTED_COMPATIBLE ${SANITIZER_SELECTED_FLAGS})
    if(SANITIZER_SELECTED_COMPATIBLE)
      message(STATUS " Building with ${SANITIZER_SELECTED_FLAGS}")
      append("${SANITIZER_SELECTED_FLAGS}" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
    else()
      message(
        FATAL_ERROR
        " Sanitizer flags ${SANITIZER_SELECTED_FLAGS} are not compatible.")
    endif()
  elseif(MSVC)
    # Only the Address sanitizer is handled in the MSVC branch.
    if(USE_SANITIZER MATCHES "([Aa]ddress)")
      message(STATUS "Building with Address sanitizer")
      append("-fsanitize=address" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
      if(AFL)
        append_quoteless(AFL_USE_ASAN=1 CMAKE_C_COMPILER_LAUNCHER
                         CMAKE_CXX_COMPILER_LAUNCHER)
      endif()
    else()
      message(
        FATAL_ERROR
        "This sanitizer not yet supported in the MSVC environment: ${USE_SANITIZER}"
      )
    endif()
  else()
    message(FATAL_ERROR "USE_SANITIZER is not supported on this platform.")
  endif()
endif()

109
cmake/Target.cmake Normal file
View File

@@ -0,0 +1,109 @@
# Check if IPO is supported
include(CheckIPOSupported)
check_ipo_supported(RESULT HAVE_IPO)
# Enable IPO in non-debug build
# (no-op when IPO is unsupported or the build type is DEBUG).
macro(target_enable_ipo NAME)
if(NOT CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG" AND HAVE_IPO)
set_property(TARGET ${NAME} PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
message (STATUS "Enabled IPO for target: ${NAME}")
endif()
endmacro()
# Declare a static library built from every .cc/.h under the calling
# directory (plus ${FBS_FILES}, if defined in the calling scope). Extra
# arguments are forwarded to target_link_libraries.
macro(target_add_lib NAME)
file(GLOB_RECURSE FILES CONFIGURE_DEPENDS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc" "*.h")
add_library(${NAME} STATIC ${FILES} ${FBS_FILES})
target_include_directories(${NAME}
PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/src>
${PROJECT_SOURCE_DIR}
${PROJECT_BINARY_DIR}/src
${PROJECT_BINARY_DIR}
)
# NOTE(review): trailing "" looks like a placeholder for empty ARGN — confirm.
target_link_libraries(${NAME} ${ARGN} "")
endmacro()
# Declare a shared library built from every .cc/.h under the calling
# directory (plus ${FBS_FILES}, if defined in the calling scope). Extra
# arguments are forwarded to target_link_libraries; IPO is enabled for
# non-debug builds.
macro(target_add_shared_lib NAME)
file(GLOB_RECURSE FILES CONFIGURE_DEPENDS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc" "*.h")
add_library(${NAME} SHARED ${FILES} ${FBS_FILES})
target_include_directories(${NAME}
PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/src>
${PROJECT_SOURCE_DIR}
${PROJECT_BINARY_DIR}/src
${PROJECT_BINARY_DIR}
)
# NOTE(review): trailing "" looks like a placeholder for empty ARGN — confirm.
target_link_libraries(${NAME} ${ARGN} "")
target_enable_ipo(${NAME})
endmacro()
# Declare an executable built from the single MAIN_FILE, linking the extra
# ARGN libraries, placed in <build>/bin. IPO is enabled for non-debug builds.
macro(target_add_bin NAME MAIN_FILE)
add_executable(${NAME} ${MAIN_FILE})
# NOTE(review): trailing "" looks like a placeholder for empty ARGN — confirm.
target_link_libraries(${NAME} ${ARGN} "")
target_include_directories(${NAME}
PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/src>
${PROJECT_SOURCE_DIR}
${PROJECT_SOURCE_DIR}/src/lib/api
${PROJECT_BINARY_DIR}/src
${PROJECT_BINARY_DIR}
)
set_target_properties(${NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
target_enable_ipo(${NAME})
endmacro()
# Declare a test executable built from every .cc under the calling directory,
# linked against gmock and the project's test_main, registered with CTest,
# and placed in <build>/tests.
macro(target_add_test NAME)
file(GLOB_RECURSE FILES CONFIGURE_DEPENDS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
add_executable(${NAME} ${FILES})
target_link_libraries(${NAME} gmock test_main ${ARGN} "")
target_include_directories(${NAME}
PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/src>
${PROJECT_SOURCE_DIR}
${PROJECT_BINARY_DIR}/src
${PROJECT_BINARY_DIR}
)
add_test(NAME ${NAME} COMMAND ${NAME})
set_target_properties(${NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests")
endmacro()
# Generate code from a flatbuffers schema and wrap the result in an INTERFACE
# target `NAME` carrying include paths and a dependency on the generated
# headers. Usage: target_add_fbs(name path/to/schema.fbs [SERVICE] [DEPS ...]).
macro(target_add_fbs NAME PATH)
cmake_parse_arguments(FBS "SERVICE" "" "DEPS" ${ARGN})
message("Fbs " ${NAME} " FBS_SERVICE " ${FBS_SERVICE} " ARGN " ${ARGN})
# Extra flatc arguments consumed by build_flatbuffers below.
# NOTE(review): --hf3fs is not a stock flatc option — presumably a flag of a
# patched flatc shipped with this project; confirm.
set(FLATBUFFERS_FLATC_SCHEMA_EXTRA_ARGS
--scoped-enums
--gen-object-api
--gen-mutable
--gen-compare
--cpp-std=c++17
--python
--hf3fs
--keep-prefix
)
get_filename_component(NAME_WE ${PATH} NAME_WE)
get_filename_component(DIR ${PATH} DIRECTORY)
# Registers a "<schema>-generated" custom target that runs flatc; include
# paths cover the schema's directory and the project src tree.
build_flatbuffers(${PATH}
"${CMAKE_CURRENT_SOURCE_DIR}/${DIR};${CMAKE_SOURCE_DIR}/src"
"${NAME_WE}-generated"
""
"${CMAKE_CURRENT_BINARY_DIR}/${DIR}"
""
""
)
add_library(${NAME} INTERFACE)
target_link_libraries(${NAME} INTERFACE common)
target_include_directories(${NAME}
INTERFACE
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/src>
${PROJECT_SOURCE_DIR}
${PROJECT_BINARY_DIR}/src
${PROJECT_BINARY_DIR}
)
add_dependencies(${NAME} "${NAME_WE}-generated" ${FBS_DEPS})
endmacro()

338
configs/admin_cli.toml Normal file
View File

@@ -0,0 +1,338 @@
break_multi_line_command_on_failure = false
cluster_id = ''
log = 'DBG:normal; normal=file:path=/var/log/3fs/cli.log,async=true,sync_level=ERR'
num_timeout_ms = 1000
profile = false
verbose = false
[client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[client.io_worker.transport_pool]
max_connections = 1
[client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[client.rdma_control]
max_concurrent_transmission = 64
[client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[fdb]
casual_read_risky = false
clusterFile = ''
default_backoff = 0
enableMultipleClient = false
externalClientDir = ''
externalClientPath = ''
multipleClientThreadNum = 4
readonly = false
trace_file = ''
trace_format = 'json'
[ib_devices]
allow_no_usable_devices = false
allow_unknown_zone = true
default_network_zone = 'UNKNOWN'
default_pkey_index = 0
default_roce_pkey_index = 0
default_traffic_class = 0
device_filter = []
fork_safe = true
prefer_ibdevice = true
skip_inactive_ports = true
skip_unusable_device = true
subnets = []
[meta_client]
check_server_interval = '5s'
dynamic_stripe = false
max_concurrent_requests = 128
network_type = 'RDMA'
remove_chunks_batch_size = 32
remove_chunks_max_iters = 1024
selection_mode = 'RandomFollow'
[meta_client.background_closer]
prune_session_batch_count = 128
prune_session_batch_interval = '10s'
retry_first_wait = '100ms'
retry_max_wait = '10s'
task_scan = '50ms'
[meta_client.background_closer.coroutine_pool]
coroutines_num = 8
enable_work_stealing = false
queue_size = 128
[meta_client.retry_default]
max_failures_before_failover = 1
retry_fast = '1s'
retry_init_wait = '500ms'
retry_max_wait = '5s'
retry_send = 1
retry_total_time = '1min'
rpc_timeout = '5s'
[mgmtd_client]
accept_incomplete_routing_info_during_mgmtd_bootstrapping = true
auto_extend_client_session_interval = '10s'
auto_heartbeat_interval = '10s'
auto_refresh_interval = '1s'
enable_auto_extend_client_session = false
enable_auto_heartbeat = false
enable_auto_refresh = true
mgmtd_server_addresses = []
work_queue_size = 100
[monitor]
collect_period = '1s'
num_collectors = 1
reporters = []
[storage_client]
check_overlapping_read_buffers = true
check_overlapping_write_buffers = false
chunk_checksum_type = 'CRC32C'
create_net_client_for_updates = false
implementation_type = 'RPC'
max_inline_read_bytes = '0'
max_inline_write_bytes = '0'
max_read_io_bytes = '0'
[storage_client.net_client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[storage_client.net_client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[storage_client.net_client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[storage_client.net_client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[storage_client.net_client.io_worker.transport_pool]
max_connections = 1
[storage_client.net_client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[storage_client.net_client.rdma_control]
max_concurrent_transmission = 64
[storage_client.net_client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[storage_client.net_client_for_updates]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[storage_client.net_client_for_updates.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[storage_client.net_client_for_updates.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[storage_client.net_client_for_updates.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[storage_client.net_client_for_updates.io_worker.transport_pool]
max_connections = 1
[storage_client.net_client_for_updates.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[storage_client.net_client_for_updates.rdma_control]
max_concurrent_transmission = 64
[storage_client.net_client_for_updates.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[storage_client.retry]
init_wait_time = '10s'
max_failures_before_failover = 1
max_retry_time = '1min'
max_wait_time = '30s'
[storage_client.traffic_control.query]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[storage_client.traffic_control.read]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[storage_client.traffic_control.remove]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[storage_client.traffic_control.truncate]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[storage_client.traffic_control.write]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[user_info]
gid = -1
gids = []
token = ''
uid = -1

View File

@@ -0,0 +1,557 @@
[[common.log.categories]]
categories = [ '.' ]
handlers = [ 'normal', 'err', 'fatal' ]
inherit = true
level = 'INFO'
propagate = 'NONE'
[[common.log.handlers]]
async = true
file_path = ''
max_file_size = '10MB'
max_files = 100
name = 'normal'
rotate = true
rotate_on_open = false
start_level = 'NONE'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = ''
max_file_size = '10MB'
max_files = 100
name = 'err'
rotate = true
rotate_on_open = false
start_level = 'ERR'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = ''
max_file_size = '10MB'
max_files = 100
name = 'fatal'
rotate = true
rotate_on_open = false
start_level = 'FATAL'
stream_type = 'STDERR'
writer_type = 'STREAM'
[common.memory]
prof_active = false
prof_prefix = ''
[common.monitor]
reporters = []
[server.agent]
allow_read_holes = true
auth_timeout = '5min'
background_ibreg = true
list_entry_limit = 100
max_concurrent_iovallocs = 0
mock_storage_dir = ''
mount_name = ''
read_only_mode = false
truncate_if_write_after_eof = false
[server.agent.inode_cache]
capacity = 4194304
entry_lifetime = '5min'
[server.agent.limit_per_process]
fd = 1048576
shm = 1048576
[server.agent.periodic_sync]
interval = '10min'
on = false
[server.agent.proc_watch]
interval = '1min'
on = true
[server.agent.storage_io.read]
enableChecksum = false
[server.agent.storage_io.read.debug]
bypass_disk_io = false
bypass_rdma_xmit = false
inject_random_client_error = false
inject_random_server_error = false
max_num_of_injection_points = 100
[server.agent.storage_io.read.retry]
init_wait_time = '0ns'
max_retry_time = '0ns'
max_wait_time = '0ns'
retry_permanent_error = false
[server.agent.storage_io.read.targetSelection]
mode = 'Default'
targetIndex = 0
trafficZone = ''
[server.agent.storage_io.write]
enableChecksum = true
[server.agent.storage_io.write.debug]
bypass_disk_io = false
bypass_rdma_xmit = false
inject_random_client_error = false
inject_random_server_error = false
max_num_of_injection_points = 100
[server.agent.storage_io.write.retry]
init_wait_time = '0ns'
max_retry_time = '0ns'
max_wait_time = '0ns'
retry_permanent_error = false
[server.agent.storage_io.write.targetSelection]
mode = 'Default'
targetIndex = 0
trafficZone = ''
[server.agent.storage_ops]
enable_read = true
enable_write = true
[server.background_client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[server.background_client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.background_client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.background_client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.background_client.io_worker.transport_pool]
max_connections = 1
[server.background_client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.background_client.rdma_control]
max_concurrent_transmission = 64
[server.background_client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.base.independent_thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.base.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 64
num_proc_threads = 64
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[[server.base.groups]]
check_connections_interval = '1min'
connection_expiration_time = '1day'
network_type = 'LOCAL'
services = [ 'ClientAgentSerde' ]
use_independent_thread_pool = false
[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.base.groups.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.base.groups.io_worker.transport_pool]
max_connections = 1
[server.base.groups.listener]
filter_list = []
listen_port = 0
listen_queue_depth = 4096
rdma_accept_timeout = '15s'
rdma_listen_ethernet = true
reuse_port = false
[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[[server.base.groups]]
check_connections_interval = '1min'
connection_expiration_time = '1day'
network_type = 'TCP'
services = [ 'Core' ]
use_independent_thread_pool = true
[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.base.groups.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.base.groups.io_worker.transport_pool]
max_connections = 1
[server.base.groups.listener]
filter_list = []
listen_port = 9000
listen_queue_depth = 4096
rdma_accept_timeout = '15s'
rdma_listen_ethernet = true
reuse_port = false
[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.meta]
check_server_interval = '5s'
dynamic_stripe = false
max_concurrent_requests = 128
network_type = 'RDMA'
remove_chunks_batch_size = 32
remove_chunks_max_iters = 1024
selection_mode = 'RandomFollow'
[server.meta.background_closer]
prune_session_batch_count = 128
prune_session_batch_interval = '10s'
retry_first_wait = '100ms'
retry_max_wait = '10s'
task_scan = '50ms'
[server.meta.background_closer.coroutine_pool]
coroutines_num = 8
enable_work_stealing = false
queue_size = 128
[server.meta.retry_default]
max_failures_before_failover = 1
retry_init_wait = '500ms'
retry_max_wait = '5s'
retry_send = 1
retry_total_time = '1min'
rpc_timeout = '2s'
[server.meta.retry_truncate]
max_failures_before_failover = 1
retry_init_wait = '2s'
retry_max_wait = '5s'
retry_send = 1
retry_total_time = '1min'
rpc_timeout = '15s'
[server.mgmtd]
accept_incomplete_routing_info_during_mgmtd_bootstrapping = true
auto_extend_client_session_interval = '10s'
auto_heartbeat_interval = '10s'
auto_refresh_interval = '10s'
enable_auto_extend_client_session = true
enable_auto_heartbeat = false
enable_auto_refresh = true
mgmtd_server_addresses = []
work_queue_size = 100
[server.storage]
check_overlapping_read_buffers = true
check_overlapping_write_buffers = false
chunk_checksum_type = 'CRC32C'
create_net_client_for_updates = false
implementation_type = 'RPC'
max_inline_read_bytes = '0'
max_inline_write_bytes = '0'
max_read_io_bytes = '0'
[server.storage.net_client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[server.storage.net_client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.storage.net_client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.storage.net_client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.storage.net_client.io_worker.transport_pool]
max_connections = 1
[server.storage.net_client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.storage.net_client.rdma_control]
max_concurrent_transmission = 64
[server.storage.net_client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.storage.net_client_for_updates]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[server.storage.net_client_for_updates.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.storage.net_client_for_updates.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.storage.net_client_for_updates.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.storage.net_client_for_updates.io_worker.transport_pool]
max_connections = 1
[server.storage.net_client_for_updates.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.storage.net_client_for_updates.rdma_control]
max_concurrent_transmission = 64
[server.storage.net_client_for_updates.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.storage.retry]
init_wait_time = '10s'
max_failures_before_failover = 1
max_retry_time = '1min'
max_wait_time = '30s'
[server.storage.traffic_control.query]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[server.storage.traffic_control.read]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[server.storage.traffic_control.remove]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[server.storage.traffic_control.truncate]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[server.storage.traffic_control.write]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,88 @@
allow_dev_version = true
cluster_id = ''
[client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[client.io_worker.transport_pool]
max_connections = 1
[client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[client.rdma_control]
max_concurrent_transmission = 64
[client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[ib_devices]
allow_unknown_zone = true
default_network_zone = 'UNKNOWN'
default_pkey_index = 0
default_traffic_class = 0
device_filter = []
fork_safe = true
skip_inactive_ports = true
subnets = []
[mgmtd_client]
accept_incomplete_routing_info_during_mgmtd_bootstrapping = true
auto_extend_client_session_interval = '10s'
auto_heartbeat_interval = '10s'
auto_refresh_interval = '10s'
enable_auto_extend_client_session = true
enable_auto_heartbeat = false
enable_auto_refresh = true
mgmtd_server_addresses = []
work_queue_size = 100

View File

@@ -0,0 +1,450 @@
attr_timeout = 30.0
batch_io_coros = 128
check_rmrf = true
chunk_size_limit = '0'
dryrun_bench_mode = false
enable_interrupt = false
enable_priority = false
enable_read_cache = true
enable_writeback_cache = false
entry_timeout = 30.0
fdatasync_update_length = false
flush_on_stat = true
fsync_length_hint = false
io_job_deq_timeout = '1ms'
io_jobq_size = 1024
iov_limit = '1MB'
max_background = 32
max_idle_threads = 10
max_jobs_per_ioring = 32
max_readahead = '16MB'
max_threads = 256
max_uid = '1M'
memset_before_read = false
negative_timeout = 5.0
notify_inval_threads = 32
rdma_buf_pool_size = 1024
readonly = false
submit_wait_jitter = '1ms'
symlink_timeout = 5.0
sync_on_stat = true
time_granularity = '1s'
[client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[client.io_worker.transport_pool]
max_connections = 1
[client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[client.rdma_control]
max_concurrent_transmission = 64
[client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[[common.log.categories]]
categories = [ '.' ]
handlers = [ 'normal', 'err', 'fatal' ]
inherit = true
level = 'INFO'
propagate = 'NONE'
[[common.log.handlers]]
async = true
file_path = '/var/log/3fs/hf3fs_fuse_main.log'
max_file_size = '100MB'
max_files = 10
name = 'normal'
rotate = true
rotate_on_open = false
start_level = 'NONE'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/hf3fs_fuse_main-err.log'
max_file_size = '100MB'
max_files = 10
name = 'err'
rotate = true
rotate_on_open = false
start_level = 'ERR'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/hf3fs_fuse_main-fatal.log'
max_file_size = '100MB'
max_files = 10
name = 'fatal'
rotate = true
rotate_on_open = false
start_level = 'FATAL'
stream_type = 'STDERR'
writer_type = 'STREAM'
[common.memory]
prof_active = false
prof_prefix = ''
[common.monitor]
collect_period = '1s'
num_collectors = 1
[[common.monitor.reporters]]
type = 'monitor_collector'
[common.monitor.reporters.monitor_collector]
remote_ip = ''
[io_bufs]
max_buf_size = '1MB'
max_readahead = '256KB'
write_buf_size = '1MB'
[io_jobq_sizes]
# NOTE(review): 'hi' (32) is much smaller than 'lo' (4096) — if these are high/low-priority
# queue depths the values look inverted; confirm intended sizing against the consumer code.
hi = 32
lo = 4096
[io_worker_coros]
hi = 8
lo = 8
[meta]
check_server_interval = '5s'
dynamic_stripe = true
max_concurrent_requests = 128
network_type = 'RDMA'
remove_chunks_batch_size = 32
remove_chunks_max_iters = 1024
selection_mode = 'RandomFollow'
[meta.background_closer]
prune_session_batch_count = 128
prune_session_batch_interval = '10s'
retry_first_wait = '100ms'
retry_max_wait = '10s'
task_scan = '50ms'
[meta.background_closer.coroutine_pool]
coroutines_num = 8
enable_work_stealing = false
queue_size = 128
[meta.retry_default]
max_failures_before_failover = 1
retry_fast = '1s'
retry_init_wait = '500ms'
retry_max_wait = '5s'
retry_send = 1
retry_total_time = '1min'
rpc_timeout = '5s'
[mgmtd]
accept_incomplete_routing_info_during_mgmtd_bootstrapping = true
auto_extend_client_session_interval = '10s'
auto_heartbeat_interval = '10s'
auto_refresh_interval = '10s'
enable_auto_extend_client_session = true
enable_auto_heartbeat = false
enable_auto_refresh = true
mgmtd_server_addresses = []
work_queue_size = 100
[periodic_sync]
enable = true
flush_write_buf = true
interval = '30s'
limit = 1000
[periodic_sync.worker]
coroutines_num = 4
enable_work_stealing = false
queue_size = 1024
[storage]
check_overlapping_read_buffers = true
check_overlapping_write_buffers = false
chunk_checksum_type = 'CRC32C'
create_net_client_for_updates = false
implementation_type = 'RPC'
max_inline_read_bytes = '0'
max_inline_write_bytes = '0'
max_read_io_bytes = '0'
[storage.net_client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[storage.net_client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[storage.net_client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[storage.net_client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[storage.net_client.io_worker.transport_pool]
max_connections = 1
[storage.net_client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[storage.net_client.rdma_control]
max_concurrent_transmission = 64
[storage.net_client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[storage.net_client_for_updates]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[storage.net_client_for_updates.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[storage.net_client_for_updates.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[storage.net_client_for_updates.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[storage.net_client_for_updates.io_worker.transport_pool]
max_connections = 1
[storage.net_client_for_updates.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[storage.net_client_for_updates.rdma_control]
max_concurrent_transmission = 64
[storage.net_client_for_updates.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[storage.retry]
init_wait_time = '10s'
max_failures_before_failover = 1
max_retry_time = '1min'
max_wait_time = '30s'
[storage.traffic_control.query]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[storage.traffic_control.read]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[storage.traffic_control.remove]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[storage.traffic_control.truncate]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[storage.traffic_control.write]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[storage_io.read]
allowReadUncommitted = false
enableChecksum = false
[storage_io.read.debug]
bypass_disk_io = false
bypass_rdma_xmit = false
inject_random_client_error = false
inject_random_server_error = false
max_num_of_injection_points = 100
[storage_io.read.retry]
init_wait_time = '0ns'
max_retry_time = '0ns'
max_wait_time = '0ns'
retry_permanent_error = false
[storage_io.read.targetSelection]
mode = 'Default'
targetIndex = 0
trafficZone = ''
[storage_io.write]
enableChecksum = true
[storage_io.write.debug]
bypass_disk_io = false
bypass_rdma_xmit = false
inject_random_client_error = false
inject_random_server_error = false
max_num_of_injection_points = 100
[storage_io.write.retry]
init_wait_time = '0ns'
max_retry_time = '0ns'
max_wait_time = '0ns'
retry_permanent_error = false
[storage_io.write.targetSelection]
mode = 'Default'
targetIndex = 0
trafficZone = ''

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,95 @@
allow_other = true
cluster_id = ''
mountpoint = ''
token_file = ''
[client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[client.io_worker.transport_pool]
max_connections = 1
[client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[client.rdma_control]
max_concurrent_transmission = 64
[client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[ib_devices]
allow_no_usable_devices = false
allow_unknown_zone = true
default_network_zone = 'UNKNOWN'
default_pkey_index = 0
default_roce_pkey_index = 0
default_traffic_class = 0
device_filter = []
fork_safe = true
prefer_ibdevice = true
skip_inactive_ports = true
skip_unusable_device = true
subnets = []
[mgmtd_client]
accept_incomplete_routing_info_during_mgmtd_bootstrapping = true
auto_extend_client_session_interval = '10s'
auto_heartbeat_interval = '10s'
auto_refresh_interval = '10s'
enable_auto_extend_client_session = true
enable_auto_heartbeat = false
enable_auto_refresh = true
mgmtd_server_addresses = []
work_queue_size = 100

633
configs/meta_main.toml Normal file
View File

@@ -0,0 +1,633 @@
[[common.log.categories]]
categories = [ '.' ]
handlers = [ 'normal', 'err', 'fatal' ]
inherit = true
level = 'INFO'
propagate = 'NONE'
[[common.log.categories]]
categories = [ 'eventlog' ]
handlers = [ 'event' ]
inherit = false
level = 'INFO'
propagate = 'ERR'
[[common.log.handlers]]
async = true
file_path = '/var/log/3fs/hf3fs_meta_main.log'
max_file_size = '100MB'
max_files = 10
name = 'normal'
rotate = true
rotate_on_open = false
start_level = 'NONE'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/hf3fs_meta_main-err.log'
max_file_size = '100MB'
max_files = 10
name = 'err'
rotate = true
rotate_on_open = false
start_level = 'ERR'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/hf3fs_meta_main-fatal.log'
max_file_size = '100MB'
max_files = 10
name = 'fatal'
rotate = true
rotate_on_open = false
start_level = 'FATAL'
stream_type = 'STDERR'
writer_type = 'STREAM'
[[common.log.handlers]]
async = true
file_path = '/var/log/3fs/hf3fs_meta_main-event.log'
max_file_size = '100MB'
max_files = 10
name = 'event'
rotate = true
rotate_on_open = false
start_level = 'INFO'
stream_type = 'STDERR'
writer_type = 'EVENT'
[common.memory]
prof_active = false
prof_prefix = ''
[common.monitor]
collect_period = '1s'
num_collectors = 1
[[common.monitor.reporters]]
type = 'monitor_collector'
[common.monitor.reporters.monitor_collector]
remote_ip = ''
[server]
use_memkv = false
[server.background_client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[server.background_client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.background_client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.background_client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.background_client.io_worker.transport_pool]
max_connections = 1
[server.background_client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.background_client.rdma_control]
max_concurrent_transmission = 64
[server.background_client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.base.independent_thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.base.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[[server.base.groups]]
check_connections_interval = '1min'
connection_expiration_time = '1day'
network_type = 'RDMA'
services = [ 'MetaSerde' ]
use_independent_thread_pool = false
[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.base.groups.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.base.groups.io_worker.transport_pool]
max_connections = 1
[server.base.groups.listener]
domain_socket_index = 1
filter_list = []
listen_port = 8001
listen_queue_depth = 4096
rdma_accept_timeout = '15s'
rdma_listen_ethernet = true
reuse_port = false
[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[[server.base.groups]]
check_connections_interval = '1min'
connection_expiration_time = '1day'
network_type = 'TCP'
services = [ 'Core' ]
use_independent_thread_pool = true
[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.base.groups.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.base.groups.io_worker.transport_pool]
max_connections = 1
[server.base.groups.listener]
domain_socket_index = 1
filter_list = []
listen_port = 9001
listen_queue_depth = 4096
rdma_accept_timeout = '15s'
rdma_listen_ethernet = true
reuse_port = false
[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.fdb]
casual_read_risky = false
clusterFile = ''
default_backoff = 0
enableMultipleClient = false
externalClientDir = ''
externalClientPath = ''
multipleClientThreadNum = 4
readonly = false
trace_file = ''
trace_format = 'json'
[server.kv_engine]
use_memkv = false
[server.kv_engine.fdb]
casual_read_risky = false
clusterFile = ''
default_backoff = 0
enableMultipleClient = false
externalClientDir = ''
externalClientPath = ''
multipleClientThreadNum = 4
readonly = false
trace_file = ''
trace_format = 'json'
[server.meta]
acl_cache_time = '15s'
allow_directly_move_to_trash = false
allow_owner_change_immutable = false
allow_stat_deleted_inodes = true
authenticate = false
batch_stat_by_path_concurrent = 4
batch_stat_concurrent = 8
check_file_hole = false
dynamic_stripe = false
dynamic_stripe_growth = 2
dynamic_stripe_initial = 16
enable_new_chunk_engine = false
grv_cache = false
idempotent_record_clean = '1min'
idempotent_record_expire = '30min'
idempotent_remove = true
idempotent_rename = false
iflags_chain_allocation = false
iflags_chunk_engine = true
ignore_length_hint = false
inodeId_abort_on_duplicate = false
inodeId_check_unique = true
list_default_limit = 128
max_batch_operations = 4096
max_directory_depth = 64
max_remove_chunks_per_request = 32
max_symlink_count = 10
max_symlink_depth = 4
operation_timeout = '5s'
otrunc_replace_file = true
otrunc_replace_file_threshold = '1GB'
readonly = false
recursive_remove_check_owner = true
recursive_remove_perm_check = 1024
statfs_cache_time = '1min'
statfs_space_imbalance_threshold = 5
statfs_update_interval = '5s'
sync_on_prune_session = false
time_granularity = '1s'
[server.meta.background_hole_checker]
coroutines_num = 16
enable_work_stealing = false
queue_size = 4096
[server.meta.distributor]
timeout = '30s'
update_interval = '1s'
[server.meta.event_trace_log]
dump_interval = '30s'
enabled = true
max_num_writers = 1
max_row_group_length = 100000
trace_file_dir = '.'
[server.meta.forward]
addr_type = 'RDMA'
debug = true
timeout = '10s'
[server.meta.gc]
check_session = true
distributed_gc = true
enable = true
gc_delay_free_space_threshold = 5
gc_directory_concurrent = 4
gc_directory_delay = '0ns'
gc_directory_entry_batch = 32
gc_directory_entry_concurrent = 4
gc_file_concurrent = 32
gc_file_delay = '5min'
large_file_chunks = 128
recursive_perm_check = true
remove_chunks_batch_size = 32
retry_delay = '10min'
scan_batch = 4096
scan_interval = '200ms'
small_file_chunks = 32
txn_low_priority = false
[server.meta.gc.retry_remove_chunks]
init_wait_time = '10s'
max_retry_time = '30s'
max_wait_time = '10s'
retry_permanent_error = false
[server.meta.gc.workers]
coroutines_num = 8
enable_work_stealing = false
queue_size = 1024
[server.meta.retry_remove_chunks]
init_wait_time = '10s'
max_retry_time = '30s'
max_wait_time = '10s'
retry_permanent_error = false
[server.meta.retry_transaction]
max_backoff = '1s'
max_retry_count = 10
[server.meta.session_manager]
enable = true
scan_batch = 1024
scan_interval = '5min'
session_timeout = '5min'
sync_on_prune_session = false
[server.meta.session_manager.close_workers]
coroutines_num = 32
enable_work_stealing = false
queue_size = 1024
[server.meta.session_manager.scan_workers]
coroutines_num = 8
enable_work_stealing = false
queue_size = 128
[server.meta.user_cache]
buckets = 127
exist_ttl = '5min'
inexist_ttl = '10s'
[server.mgmtd_client]
accept_incomplete_routing_info_during_mgmtd_bootstrapping = true
auto_extend_client_session_interval = '10s'
auto_heartbeat_interval = '10s'
auto_refresh_interval = '10s'
enable_auto_extend_client_session = false
enable_auto_heartbeat = true
enable_auto_refresh = true
mgmtd_server_addresses = []
work_queue_size = 100
[server.storage_client]
check_overlapping_read_buffers = true
check_overlapping_write_buffers = false
chunk_checksum_type = 'CRC32C'
create_net_client_for_updates = false
implementation_type = 'RPC'
max_inline_read_bytes = '0'
max_inline_write_bytes = '0'
max_read_io_bytes = '0'
[server.storage_client.net_client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[server.storage_client.net_client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.storage_client.net_client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.storage_client.net_client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.storage_client.net_client.io_worker.transport_pool]
max_connections = 1
[server.storage_client.net_client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.storage_client.net_client.rdma_control]
max_concurrent_transmission = 64
[server.storage_client.net_client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.storage_client.net_client_for_updates]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[server.storage_client.net_client_for_updates.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.storage_client.net_client_for_updates.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.storage_client.net_client_for_updates.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.storage_client.net_client_for_updates.io_worker.transport_pool]
max_connections = 1
[server.storage_client.net_client_for_updates.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.storage_client.net_client_for_updates.rdma_control]
max_concurrent_transmission = 64
[server.storage_client.net_client_for_updates.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.storage_client.retry]
init_wait_time = '2s'
max_failures_before_failover = 1
max_retry_time = '5s'
max_wait_time = '5s'
[server.storage_client.traffic_control.query]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[server.storage_client.traffic_control.read]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[server.storage_client.traffic_control.remove]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[server.storage_client.traffic_control.truncate]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[server.storage_client.traffic_control.write]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true

View File

@@ -0,0 +1,2 @@
allow_empty_node_id = true
node_id = 0

View File

@@ -0,0 +1,93 @@
allow_dev_version = true
cluster_id = ''
[client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[client.io_worker.transport_pool]
max_connections = 1
[client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[client.rdma_control]
max_concurrent_transmission = 64
[client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[ib_devices]
allow_no_usable_devices = false
allow_unknown_zone = true
default_network_zone = 'UNKNOWN'
default_pkey_index = 0
default_roce_pkey_index = 0
default_traffic_class = 0
device_filter = []
fork_safe = true
prefer_ibdevice = true
skip_inactive_ports = true
skip_unusable_device = true
subnets = []
[mgmtd_client]
accept_incomplete_routing_info_during_mgmtd_bootstrapping = true
auto_extend_client_session_interval = '10s'
auto_heartbeat_interval = '10s'
auto_refresh_interval = '10s'
enable_auto_extend_client_session = false
enable_auto_heartbeat = true
enable_auto_refresh = true
mgmtd_server_addresses = []
work_queue_size = 100

232
configs/mgmtd_main.toml Normal file
View File

@@ -0,0 +1,232 @@
[[common.log.categories]]
categories = [ '.' ]
handlers = [ 'normal', 'err', 'fatal' ]
inherit = true
level = 'INFO'
propagate = 'NONE'
[[common.log.handlers]]
async = true
file_path = '/var/log/3fs/mgmtd_main.log'
max_file_size = '100MB'
max_files = 10
name = 'normal'
rotate = true
rotate_on_open = false
start_level = 'NONE'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/mgmtd_main-err.log'
max_file_size = '100MB'
max_files = 10
name = 'err'
rotate = true
rotate_on_open = false
start_level = 'ERR'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/mgmtd_main-fatal.log'
max_file_size = '100MB'
max_files = 10
name = 'fatal'
rotate = true
rotate_on_open = false
start_level = 'FATAL'
stream_type = 'STDERR'
writer_type = 'STREAM'
[common.memory]
prof_active = false
prof_prefix = ''
[common.monitor]
collect_period = '1s'
num_collectors = 1
[[common.monitor.reporters]]
type = 'monitor_collector'
[common.monitor.reporters.monitor_collector]
remote_ip = ''
[server.base.independent_thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.base.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[[server.base.groups]]
check_connections_interval = '1min'
connection_expiration_time = '1day'
network_type = 'RDMA'
services = [ 'Mgmtd' ]
use_independent_thread_pool = false
[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.base.groups.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.base.groups.io_worker.transport_pool]
max_connections = 1
[server.base.groups.listener]
domain_socket_index = 1
filter_list = []
listen_port = 8000
listen_queue_depth = 4096
rdma_accept_timeout = '15s'
rdma_listen_ethernet = true
reuse_port = false
[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[[server.base.groups]]
check_connections_interval = '1min'
connection_expiration_time = '1day'
network_type = 'TCP'
services = [ 'Core' ]
use_independent_thread_pool = true
[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.base.groups.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.base.groups.io_worker.transport_pool]
max_connections = 1
[server.base.groups.listener]
domain_socket_index = 1
filter_list = []
listen_port = 9000
listen_queue_depth = 4096
rdma_accept_timeout = '15s'
rdma_listen_ethernet = true
reuse_port = false
[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.service]
allow_heartbeat_from_unregistered = true
authenticate = false
bootstrapping_length = '2min'
bump_routing_info_version_interval = '5s'
check_status_interval = '10s'
client_session_timeout = '20min'
enable_routinginfo_cache = true
extend_lease_check_release_version = true
extend_lease_interval = '10s'
heartbeat_fail_interval = '1min'
heartbeat_ignore_stale_targets = true
heartbeat_ignore_unknown_targets = false
heartbeat_timestamp_valid_window = '30s'
lease_length = '1min'
new_chain_bootstrap_interval = '2min'
only_accept_client_uuid = false
retry_times_on_txn_errors = -1
send_heartbeat = true
send_heartbeat_interval = '10s'
suspicious_lease_interval = '20s'
target_info_load_interval = '1s'
target_info_persist_batch = 1000
target_info_persist_interval = '1s'
try_adjust_target_order_as_preferred = false
update_chains_interval = '1s'
update_metrics_interval = '1s'
validate_lease_on_write = true
[server.service.retry_transaction]
max_backoff = '1s'
max_retry_count = 10
[server.service.user_cache]
buckets = 127
exist_ttl = '5min'
inexist_ttl = '10s'

View File

@@ -0,0 +1,2 @@
allow_empty_node_id = true
node_id = 0

View File

@@ -0,0 +1,44 @@
allow_dev_version = true
cluster_id = ''
use_memkv = false
[fdb]
casual_read_risky = false
clusterFile = ''
default_backoff = 0
enableMultipleClient = false
externalClientDir = ''
externalClientPath = ''
multipleClientThreadNum = 4
readonly = false
trace_file = ''
trace_format = 'json'
[ib_devices]
allow_no_usable_devices = false
allow_unknown_zone = true
default_network_zone = 'UNKNOWN'
default_pkey_index = 0
default_roce_pkey_index = 0
default_traffic_class = 0
device_filter = []
fork_safe = true
prefer_ibdevice = true
skip_inactive_ports = true
skip_unusable_device = true
subnets = []
[kv_engine]
use_memkv = false
[kv_engine.fdb]
casual_read_risky = false
clusterFile = ''
default_backoff = 0
enableMultipleClient = false
externalClientDir = ''
externalClientPath = ''
multipleClientThreadNum = 4
readonly = false
trace_file = ''
trace_format = 'json'

View File

@@ -0,0 +1,141 @@
[common]
cluster_id = ''
[common.ib_devices]
allow_unknown_zone = true
default_network_zone = 'UNKNOWN'
device_filter = []
subnets = []
[[common.log.categories]]
categories = [ '.' ]
handlers = [ 'normal', 'err', 'fatal' ]
inherit = true
level = 'INFO'
propagate = 'NONE'
[[common.log.handlers]]
async = true
file_path = '/var/log/3fs/monitor_collector_main.log'
max_file_size = '100MB'
max_files = 10
name = 'normal'
rotate = true
rotate_on_open = false
start_level = 'NONE'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/monitor_collector_main-err.log'
max_file_size = '100MB'
max_files = 10
name = 'err'
rotate = true
rotate_on_open = false
start_level = 'ERR'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/monitor_collector_main-fatal.log'
max_file_size = '100MB'
max_files = 10
name = 'fatal'
rotate = true
rotate_on_open = false
start_level = 'FATAL'
stream_type = 'STDERR'
writer_type = 'STREAM'
[server.base.independent_thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.base.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[[server.base.groups]]
#default_timeout = '1s'
#drop_connections_interval = '1h'
network_type = 'TCP'
services = [ 'MonitorCollector' ]
use_independent_thread_pool = false
[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drop_connections = 0
event_ack_batch = 128
#gid_index = 0
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
pkey_index = 0
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
traffic_class = 0
[server.base.groups.io_worker.transport_pool]
max_connections = 1
[server.base.groups.listener]
filter_list = []
listen_port = 10000
listen_queue_depth = 4096
rdma_listen_ethernet = true
reuse_port = false
[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
[server.monitor_collector]
batch_commit_size = 4096
conn_threads = 32
queue_capacity = 204800
[server.monitor_collector.reporter]
type = 'clickhouse'
[server.monitor_collector.reporter.clickhouse]
db = ''
host = ''
passwd = ''
port = ''
user = ''

509
configs/storage_main.toml Normal file
View File

@@ -0,0 +1,509 @@
[[common.log.categories]]
categories = [ '.' ]
handlers = [ 'normal', 'err', 'fatal' ]
inherit = true
level = 'INFO'
propagate = 'NONE'
[[common.log.handlers]]
async = true
file_path = '/var/log/3fs/storage_main.log'
max_file_size = '100MB'
max_files = 10
name = 'normal'
rotate = true
rotate_on_open = false
start_level = 'NONE'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/storage_main-err.log'
max_file_size = '100MB'
max_files = 10
name = 'err'
rotate = true
rotate_on_open = false
start_level = 'ERR'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/storage_main-fatal.log'
max_file_size = '100MB'
max_files = 10
name = 'fatal'
rotate = true
rotate_on_open = false
start_level = 'FATAL'
stream_type = 'STDERR'
writer_type = 'STREAM'
[common.memory]
prof_active = false
prof_prefix = ''
[common.monitor]
collect_period = '1s'
num_collectors = 1
[[common.monitor.reporters]]
type = 'monitor_collector'
[common.monitor.reporters.monitor_collector]
remote_ip = ''
[server]
speed_up_quit = true
use_coroutines_pool_read = true
use_coroutines_pool_update = true
[server.aio_read_worker]
enable_io_uring = true
inflight_control_offset = 128
ioengine = 'libaio'
max_events = 512
min_complete = 128
num_threads = 32
queue_size = 4096
wait_all_inflight = false
[server.allocate_worker]
max_remain_groups = 8
max_remain_ultra_groups = 4
max_reserved_chunks = '1GB'
min_remain_groups = 4
min_remain_ultra_groups = 0
[server.base.independent_thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.base.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 32
num_proc_threads = 32
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[[server.base.groups]]
check_connections_interval = '1min'
connection_expiration_time = '1day'
network_type = 'RDMA'
services = [ 'StorageSerde' ]
use_independent_thread_pool = false
[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.base.groups.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.base.groups.io_worker.transport_pool]
max_connections = 1
[server.base.groups.listener]
domain_socket_index = 1
filter_list = []
listen_port = 8000
listen_queue_depth = 4096
rdma_accept_timeout = '15s'
rdma_listen_ethernet = true
reuse_port = false
[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[[server.base.groups]]
check_connections_interval = '1min'
connection_expiration_time = '1day'
network_type = 'TCP'
services = [ 'Core' ]
use_independent_thread_pool = true
[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.base.groups.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.base.groups.io_worker.transport_pool]
max_connections = 1
[server.base.groups.listener]
domain_socket_index = 1
filter_list = []
listen_port = 9000
listen_queue_depth = 4096
rdma_accept_timeout = '15s'
rdma_listen_ethernet = true
reuse_port = false
[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.buffer_pool]
big_rdmabuf_count = 64
big_rdmabuf_size = '64MB'
rdmabuf_count = 1024
rdmabuf_size = '4MB'
[server.check_worker]
disk_low_space_threshold = 0.95999999999999996
disk_reject_create_chunk_threshold = 0.97999999999999998
emergency_recycling_ratio = 0.94999999999999996
update_target_size_interval = '10s'
[server.client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[server.client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.client.io_worker.transport_pool]
max_connections = 1
[server.client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.client.rdma_control]
max_concurrent_transmission = 64
[server.client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.coroutines_pool_default]
coroutines_num = 64
queue_size = 1024
threads_num = 8
[server.coroutines_pool_read]
coroutines_num = 64
queue_size = 1024
threads_num = 8
[server.coroutines_pool_sync]
coroutines_num = 64
queue_size = 1024
threads_num = 8
[server.coroutines_pool_update]
coroutines_num = 64
queue_size = 1024
threads_num = 8
[server.dump_worker]
dump_interval = '1day'
dump_root_path = ''
high_cpu_usage_threshold = 100
[server.forward_client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[server.forward_client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.forward_client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.forward_client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.forward_client.io_worker.transport_pool]
max_connections = 1
[server.forward_client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.forward_client.rdma_control]
max_concurrent_transmission = 64
[server.forward_client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.mgmtd]
accept_incomplete_routing_info_during_mgmtd_bootstrapping = true
auto_extend_client_session_interval = '10s'
auto_heartbeat_interval = '10s'
auto_refresh_interval = '10s'
enable_auto_extend_client_session = false
enable_auto_heartbeat = true
enable_auto_refresh = true
mgmtd_server_addresses = []
work_queue_size = 100
[server.reliable_forwarding]
max_inline_forward_bytes = '0'
retry_first_wait = '100ms'
retry_max_wait = '1s'
retry_total_time = '1min'
[server.reliable_update]
clean_up_expired_clients = false
expired_clients_timeout = '1h'
[server.storage]
apply_transmission_before_getting_semaphore = true
batch_read_ignore_chain_version = false
batch_read_job_split_size = 1024
max_concurrent_rdma_reads = 256
max_concurrent_rdma_writes = 256
max_num_results_per_query = 100
post_buffer_per_bytes = '64KB'
rdma_transmission_req_timeout = '0ns'
read_only = false
[server.storage.event_trace_log]
dump_interval = '30s'
enabled = true
max_num_writers = 1
max_row_group_length = 100000
trace_file_dir = '.'
[server.storage.write_worker]
bg_num_threads = 8
num_threads = 32
queue_size = 4096
[server.sync_meta_kv_worker]
sync_meta_kv_interval = '1min'
[server.sync_worker]
batch_size = 16
full_sync_chains = []
full_sync_level = 'NONE'
num_channels = 1024
num_threads = 16
sync_start_timeout = '10s'
[server.sync_worker.batch_concurrency_limiter]
max_concurrency = 64
[server.sync_worker.pool]
coroutines_num = 64
enable_work_stealing = false
queue_size = 1024
[server.targets]
allow_disk_without_uuid = false
collect_all_fds = true
create_engine_path = true
space_info_cache_timeout = '5s'
target_num_per_path = 0
target_paths = []
[server.targets.storage_target]
force_persist = true
kv_path = ''
migrate_kv_store = false
mutex_num = 257
point_query_strategy = 'NONE'
[server.targets.storage_target.file_store]
preopen_chunk_size_list = []
[server.targets.storage_target.kv_store]
create_if_missing = false
integrate_leveldb_log = false
leveldb_block_cache_size = '8GB'
leveldb_iterator_fill_cache = true
leveldb_shared_block_cache = true
leveldb_sst_file_size = '16MB'
leveldb_write_buffer_size = '16MB'
rocksdb_avoid_flush_during_recovery = false
rocksdb_avoid_flush_during_shutdown = false
rocksdb_avoid_unnecessary_blocking_io = false
rocksdb_block_cache_size = '8GB'
rocksdb_block_size = '4KB'
rocksdb_bloom_filter_bits_per_key = 10
rocksdb_compression = 'kNoCompression'
rocksdb_enable_bloom_filter = true
rocksdb_enable_pipelined_write = false
rocksdb_enable_prefix_transform = true
rocksdb_keep_log_file_num = 10
rocksdb_level0_file_num_compaction_trigger = 4
rocksdb_lowest_used_cache_tier = 'kNonVolatileBlockTier'
rocksdb_max_manifest_file_size = '64MB'
rocksdb_num_levels = 7
rocksdb_prepopulate_block_cache = 'kDisable'
rocksdb_readahead_size = '2MB'
rocksdb_shared_block_cache = true
rocksdb_stats_dump_period = '2min'
rocksdb_target_file_size_base = '64MB'
rocksdb_target_file_size_multiplier = 1
rocksdb_threads_num = 8
rocksdb_unordered_write = false
rocksdb_wal_recovery_mode = 'kTolerateCorruptedTailRecords'
rocksdb_write_buffer_size = '16MB'
sync_when_write = true
type = 'LevelDB'
[server.targets.storage_target.meta_store]
allocate_size = '256MB'
punch_hole_batch_size = 16
recycle_batch_size = 256
removed_chunk_expiration_time = '3day'
removed_chunk_force_recycled_time = '1h'

View File

@@ -0,0 +1,2 @@
allow_empty_node_id = true
node_id = 0

View File

@@ -0,0 +1,93 @@
allow_dev_version = true
cluster_id = ''
[client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[client.io_worker.transport_pool]
max_connections = 1
[client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[client.rdma_control]
max_concurrent_transmission = 64
[client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[ib_devices]
allow_no_usable_devices = false
allow_unknown_zone = true
default_network_zone = 'UNKNOWN'
default_pkey_index = 0
default_roce_pkey_index = 0
default_traffic_class = 0
device_filter = []
fork_safe = true
prefer_ibdevice = true
skip_inactive_ports = true
skip_unusable_device = true
subnets = []
[mgmtd_client]
accept_incomplete_routing_info_during_mgmtd_bootstrapping = true
auto_extend_client_session_interval = '10s'
auto_heartbeat_interval = '10s'
auto_refresh_interval = '10s'
enable_auto_extend_client_session = false
enable_auto_heartbeat = true
enable_auto_refresh = true
mgmtd_server_addresses = []
work_queue_size = 100

373
deploy/README.md Normal file
View File

@@ -0,0 +1,373 @@
# 3FS Setup Guide
This section provides a manual deployment guide for setting up a six-node cluster with the cluster ID `stage`.
## Installation prerequisites
### Hardware specifications
| Node | OS | IP | Memory | SSD | RDMA |
|----------|---------------|--------------|--------|------------|-------|
| meta | Ubuntu 22.04 | 192.168.1.1 | 128GB | - | RoCE |
| storage1 | Ubuntu 22.04 | 192.168.1.2 | 512GB | 14TB × 16 | RoCE |
| storage2 | Ubuntu 22.04 | 192.168.1.3 | 512GB | 14TB × 16 | RoCE |
| storage3 | Ubuntu 22.04 | 192.168.1.4 | 512GB | 14TB × 16 | RoCE |
| storage4 | Ubuntu 22.04 | 192.168.1.5 | 512GB | 14TB × 16 | RoCE |
| storage5 | Ubuntu 22.04 | 192.168.1.6 | 512GB | 14TB × 16 | RoCE |
> **RDMA Configuration**
> 1. Assign IP addresses to RDMA NICs. Multiple RDMA NICs (InfiniBand or RoCE) are supported on each node.
> 2. Check RDMA connectivity between nodes using `ib_write_bw`.
### Third-party dependencies
In a production environment, it is recommended to install FoundationDB and ClickHouse on dedicated nodes.
| Service | Node |
|------------|-------------------------|
| [ClickHouse](https://clickhouse.com/docs/install) | meta |
| [FoundationDB](https://apple.github.io/foundationdb/administration.html) | meta |
> **FoundationDB**
> 1. Ensure that the version of FoundationDB client matches the server version, or copy the corresponding version of `libfdb_c.so` to maintain compatibility.
> 2. Find the `fdb.cluster` file and `libfdb_c.so` at `/etc/foundationdb/fdb.cluster`, `/usr/lib/libfdb_c.so` on nodes with FoundationDB installed.
---
## Step 0: Build 3FS
Follow the [instructions](../README.md#build-3fs) to build 3FS. Binaries can be found in `build/bin`.
### Services and clients
The following steps show how to install 3FS services in `/opt/3fs/bin` and the config files in `/opt/3fs/etc`.
| Service | Binary | Config files | NodeID | Node |
|------------|-------------------------|-----------------------------------------------------------------------------|--------|---------------|
| monitor | monitor_collector_main | [monitor_collector_main.toml](../configs/monitor_collector_main.toml) | - | meta |
| admin_cli | admin_cli | [admin_cli.toml](../configs/admin_cli.toml)<br>fdb.cluster | - | meta<br>storage1<br>storage2<br>storage3<br>storage4<br>storage5 |
| mgmtd | mgmtd_main | [mgmtd_main_launcher.toml](../configs/mgmtd_main_launcher.toml)<br>[mgmtd_main.toml](../configs/mgmtd_main.toml)<br>[mgmtd_main_app.toml](../configs/mgmtd_main_app.toml)<br>fdb.cluster | 1 | meta |
| meta | meta_main | [meta_main_launcher.toml](../configs/meta_main_launcher.toml)<br>[meta_main.toml](../configs/meta_main.toml)<br>[meta_main_app.toml](../configs/meta_main_app.toml)<br>fdb.cluster | 100 | meta |
| storage | storage_main | [storage_main_launcher.toml](../configs/storage_main_launcher.toml)<br>[storage_main.toml](../configs/storage_main.toml)<br>[storage_main_app.toml](../configs/storage_main_app.toml) | 10001~10005 | storage1<br>storage2<br>storage3<br>storage4<br>storage5 |
| client | hf3fs_fuse_main | [hf3fs_fuse_main_launcher.toml](../configs/hf3fs_fuse_main_launcher.toml)<br>[hf3fs_fuse_main.toml](../configs/hf3fs_fuse_main.toml) | - | meta |
---
## Step 1: Create ClickHouse tables for metrics
Import the SQL file into ClickHouse:
```bash
clickhouse-client -n < ~/3fs/deploy/sql/3fs-monitor.sql
```
---
## Step 2: Monitor service
Install `monitor_collector` service on the **meta** node.
1. Copy `monitor_collector_main` to `/opt/3fs/bin` and config files to `/opt/3fs/etc`, and create log directory `/var/log/3fs`.
```bash
mkdir -p /opt/3fs/{bin,etc}
mkdir -p /var/log/3fs
cp ~/3fs/build/bin/monitor_collector_main /opt/3fs/bin
cp ~/3fs/configs/monitor_collector_main.toml /opt/3fs/etc
```
2. Update [`monitor_collector_main.toml`](../configs/monitor_collector_main.toml) to add a ClickHouse connection:
```toml
[server.monitor_collector.reporter]
type = 'clickhouse'
[server.monitor_collector.reporter.clickhouse]
db = '3fs'
host = '<CH_HOST>'
passwd = '<CH_PASSWD>'
port = '<CH_PORT>'
user = '<CH_USER>'
```
3. Start monitor service:
```bash
cp ~/3fs/deploy/systemd/monitor_collector_main.service /usr/lib/systemd/system
systemctl start monitor_collector_main
```
Note that
> - Multiple instances of monitor services can be deployed behind a virtual IP address to share the traffic.
> - Other services communicate with the monitor service over a TCP connection.
---
## Step 3: Admin client
Install `admin_cli` on **all** nodes.
1. Copy `admin_cli` to `/opt/3fs/bin` and config files to `/opt/3fs/etc`.
```bash
mkdir -p /opt/3fs/{bin,etc}
rsync -avz meta:~/3fs/build/bin/admin_cli /opt/3fs/bin
rsync -avz meta:~/3fs/configs/admin_cli.toml /opt/3fs/etc
rsync -avz meta:/etc/foundationdb/fdb.cluster /opt/3fs/etc
```
2. Update [`admin_cli.toml`](../configs/admin_cli.toml) to set `cluster_id` and `clusterFile`:
```toml
cluster_id = "stage"
[fdb]
clusterFile = '/opt/3fs/etc/fdb.cluster'
```
The full help documentation for `admin_cli` can be displayed by running the following command:
```bash
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml help
```
---
## Step 4: Mgmtd service
Install `mgmtd` service on **meta** node.
1. Copy `mgmtd_main` to `/opt/3fs/bin` and config files to `/opt/3fs/etc`.
```bash
cp ~/3fs/build/bin/mgmtd_main /opt/3fs/bin
cp ~/3fs/configs/{mgmtd_main.toml,mgmtd_main_launcher.toml,mgmtd_main_app.toml} /opt/3fs/etc
```
2. Update config files:
- Set mgmtd `node_id = 1` in [`mgmtd_main_app.toml`](../configs/mgmtd_main_app.toml).
- Edit [`mgmtd_main_launcher.toml`](../configs/mgmtd_main_launcher.toml) to set the `cluster_id` and `clusterFile`:
```toml
cluster_id = "stage"
[fdb]
clusterFile = '/opt/3fs/etc/fdb.cluster'
```
- Set monitor address in [`mgmtd_main.toml`](../configs/mgmtd_main.toml):
```toml
[common.monitor.reporters.monitor_collector]
remote_ip = "192.168.1.1:10000"
```
3. Initialize the cluster:
```bash
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml "init-cluster --mgmtd /opt/3fs/etc/mgmtd_main.toml 1 1048576 16"
```
The parameters of `admin_cli`:
> - `1` the chain table ID
> - `1048576` the chunk size in bytes
> - `16` the file stripe size
Run `help init-cluster` for full documentation.
4. Start mgmtd service:
```bash
cp ~/3fs/deploy/systemd/mgmtd_main.service /usr/lib/systemd/system
systemctl start mgmtd_main
```
5. Run `list-nodes` command to check if the cluster has been successfully initialized:
```bash
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' "list-nodes"
```
If multiple instances of `mgmtd` services are deployed, one of the `mgmtd` services is elected as the primary; others are secondaries. Automatic failover occurs when the primary fails.
---
## Step 5: Meta service
Install `meta` service on **meta** node.
1. Copy `meta_main` to `/opt/3fs/bin` and config files to `/opt/3fs/etc`.
```bash
cp ~/3fs/build/bin/meta_main /opt/3fs/bin
cp ~/3fs/configs/{meta_main_launcher.toml,meta_main.toml,meta_main_app.toml} /opt/3fs/etc
```
2. Update config files:
- Set meta `node_id = 100` in [`meta_main_app.toml`](../configs/meta_main_app.toml).
- Set `cluster_id`, `clusterFile` and mgmtd address in [`meta_main_launcher.toml`](../configs/meta_main_launcher.toml):
```toml
cluster_id = "stage"
[mgmtd_client]
mgmtd_server_addresses = ["RDMA://192.168.1.1:8000"]
```
- Set mgmtd and monitor addresses in [`meta_main.toml`](../configs/meta_main.toml).
```toml
[server.mgmtd_client]
mgmtd_server_addresses = ["RDMA://192.168.1.1:8000"]
[common.monitor.reporters.monitor_collector]
remote_ip = "192.168.1.1:10000"
[server.fdb]
clusterFile = '/opt/3fs/etc/fdb.cluster'
```
3. Config file of meta service is managed by mgmtd service. Use `admin_cli` to upload the config file to mgmtd:
```bash
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' "set-config --type META --file /opt/3fs/etc/meta_main.toml"
```
4. Start meta service:
```bash
cp ~/3fs/deploy/systemd/meta_main.service /usr/lib/systemd/system
systemctl start meta_main
```
5. Run `list-nodes` command to check if meta service has joined the cluster:
```bash
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' "list-nodes"
```
If multiple instances of `meta` services are deployed, meta requests will be evenly distributed to all instances.
---
## Step 6: Storage service
Install `storage` service on **storage** node.
1. Format the attached 16 SSDs as XFS and mount at `/storage/data{1..16}`, then create data directories `/storage/data{1..16}/3fs` and log directory `/var/log/3fs`.
```bash
mkdir -p /storage/data{1..16}
mkdir -p /var/log/3fs
for i in {1..16};do mkfs.xfs -L data${i} /dev/nvme${i}n1;mount -o noatime,nodiratime -L data${i} /storage/data${i};done
mkdir -p /storage/data{1..16}/3fs
```
2. Increase the max number of asynchronous aio requests:
```bash
sysctl -w fs.aio-max-nr=67108864
```
3. Copy `storage_main` to `/opt/3fs/bin` and config files to `/opt/3fs/etc`.
```bash
rsync -avz meta:~/3fs/build/bin/storage_main /opt/3fs/bin
rsync -avz meta:~/3fs/configs/{storage_main_launcher.toml,storage_main.toml,storage_main_app.toml} /opt/3fs/etc
```
4. Update config files:
- Set `node_id` in [`storage_main_app.toml`](../configs/storage_main_app.toml). Each storage service is assigned a unique id between `10001` and `10005`.
- Set `cluster_id` and mgmtd address in [`storage_main_launcher.toml`](../configs/storage_main_launcher.toml).
```toml
cluster_id = "stage"
[mgmtd_client]
mgmtd_server_addresses = ["RDMA://192.168.1.1:8000"]
```
- Add target paths in [`storage_main.toml`](../configs/storage_main.toml):
```toml
[server.mgmtd]
mgmtd_server_addresses = ["RDMA://192.168.1.1:8000"]
[common.monitor.reporters.monitor_collector]
remote_ip = "192.168.1.1:10000"
[server.targets]
target_paths = ["/storage/data1/3fs","/storage/data2/3fs","/storage/data3/3fs","/storage/data4/3fs","/storage/data5/3fs","/storage/data6/3fs","/storage/data7/3fs","/storage/data8/3fs","/storage/data9/3fs","/storage/data10/3fs","/storage/data11/3fs","/storage/data12/3fs","/storage/data13/3fs","/storage/data14/3fs","/storage/data15/3fs","/storage/data16/3fs",]
```
5. Config file of storage service is managed by mgmtd service. Use `admin_cli` to upload the config file to mgmtd:
```bash
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' "set-config --type STORAGE --file /opt/3fs/etc/storage_main.toml"
```
6. Start storage service:
```bash
rsync -avz meta:~/3fs/deploy/systemd/storage_main.service /usr/lib/systemd/system
systemctl start storage_main
```
7. Run `list-nodes` command to check if storage service has joined the cluster:
```
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' "list-nodes"
```
---
## Step 7: Create admin user, storage targets and chain table
1. Create an admin user:
```bash
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' "user-add --root --admin 0 root"
```
Save the admin token to `/opt/3fs/etc/token.txt`.
2. Generate `admin_cli` commands to create storage targets on 5 storage nodes (16 SSD per node, 6 targets per SSD).
- Follow instructions at [here](data_placement/README.md) to install Python packages.
```bash
python ~/3fs/deploy/data_placement/src/model/data_placement.py \
-ql -relax -type CR --num_nodes 5 --replication_factor 3 --min_targets_per_disk 6
python ~/3fs/deploy/data_placement/src/setup/gen_chain_table.py \
--chain_table_type CR --node_id_begin 10001 --node_id_end 10005 \
--num_disks_per_node 16 --num_targets_per_disk 6 \
--target_id_prefix 1 --chain_id_prefix 9 \
--incidence_matrix_path output/DataPlacementModel-v_5-b_10-r_6-k_3-λ_2-lb_1-ub_1/incidence_matrix.pickle
```
The following 3 files will be generated in `output` directory: `create_target_cmd.txt`, `generated_chains.csv`, and `generated_chain_table.csv`.
3. Create storage targets:
```bash
/opt/3fs/bin/admin_cli --cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' --config.user_info.token $(<"/opt/3fs/etc/token.txt") < output/create_target_cmd.txt
```
4. Upload chains to mgmtd service:
```bash
/opt/3fs/bin/admin_cli --cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' --config.user_info.token $(<"/opt/3fs/etc/token.txt") "upload-chains output/generated_chains.csv"
```
5. Upload chain table to mgmtd service:
```bash
/opt/3fs/bin/admin_cli --cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' --config.user_info.token $(<"/opt/3fs/etc/token.txt") "upload-chain-table --desc stage 1 output/generated_chain_table.csv"
```
6. List chains and chain tables to check if they have been correctly uploaded:
```bash
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' "list-chains"
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' "list-chain-tables"
```
---
## Step 8: FUSE client
For simplicity, the FUSE client is deployed on the **meta** node in this guide. However, we strongly advise against deploying clients on service nodes in a production environment.
1. Copy `hf3fs_fuse_main` to `/opt/3fs/bin` and config files to `/opt/3fs/etc`.
```bash
cp ~/3fs/build/bin/hf3fs_fuse_main /opt/3fs/bin
cp ~/3fs/configs/{hf3fs_fuse_main_launcher.toml,hf3fs_fuse_main.toml,hf3fs_fuse_main_app.toml} /opt/3fs/etc
```
2. Create the mount point:
```bash
mkdir -p /3fs/stage
```
3. Set cluster ID, mountpoint, token file and mgmtd address in [`hf3fs_fuse_main_launcher.toml`](../configs/hf3fs_fuse_main_launcher.toml)
```toml
cluster_id = "stage"
mountpoint = '/3fs/stage'
token_file = '/opt/3fs/etc/token.txt'
[mgmtd_client]
mgmtd_server_addresses = ["RDMA://192.168.1.1:8000"]
```
4. Set mgmtd and monitor address in [`hf3fs_fuse_main.toml`](../configs/hf3fs_fuse_main.toml).
```toml
[mgmtd]
mgmtd_server_addresses = ["RDMA://192.168.1.1:8000"]
[common.monitor.reporters.monitor_collector]
remote_ip = "192.168.1.1:10000"
```
5. Config file of FUSE client is also managed by mgmtd service. Use `admin_cli` to upload the config file to mgmtd:
```bash
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' "set-config --type FUSE --file /opt/3fs/etc/hf3fs_fuse_main.toml"
```
6. Start FUSE client:
```bash
cp ~/3fs/deploy/systemd/hf3fs_fuse_main.service /usr/lib/systemd/system
systemctl start hf3fs_fuse_main
```
7. Check if 3FS has been mounted at `/3fs/stage`:
```bash
mount | grep '/3fs/stage'
```
## FAQ
<details>
<summary>How to troubleshoot <code>admin_cli init-cluster</code> error?</summary>
If mgmtd fails to start after running `init-cluster`, the most likely cause is an error in `mgmtd_main.toml`. Any changes to this file require clearing all FoundationDB data and re-running `init-cluster`.
</details>
---
<details>
<summary>How to build a single-node cluster?</summary>
A minimum of two storage services is required for data replication. If `--num_nodes=1` is set, the `gen_chain_table.py` script will fail. In a test environment, this limitation can be bypassed by deploying multiple storage services on a single machine.
</details>
---
<details>
<summary>How to update config files?</summary>
All config files are managed by mgmtd. If any `*_main.toml` is updated, such as `storage_main.toml`, the modified file should be uploaded using `admin_cli set-config`.
</details>
---
<details>
<summary>How to troubleshoot common deployment issues?</summary>
When encountering any error during deployment,
- Check the log messages in `stdout/stderr` using `journalctl`, especially during service startup.
- Check log files stored in `/var/log/3fs/` on service and client nodes.
- Ensure that the directory `/var/log/3fs/` exists before starting any service.
</details>

17
deploy/data_placement/.gitignore vendored Normal file
View File

@@ -0,0 +1,17 @@
__pycache__
.ipynb_checkpoints
.tmp/
dist/
build/
output/
*.egg-info/
test/scratch/
test/runtime/
*.log
*.pyc
*.xml
.tmp/
.idea
.coverage
.vscode/
.hypothesis/

View File

@@ -0,0 +1,60 @@
# How to generate chain tables
Suppose we are going to setup a small 3FS cluster:
- 3 replicas for each chunk
- 5 storage nodes: `10001 ... 10005`
- 16 SSDs attached to each node
- 6 storage targets on each SSD
First generate a solution of the data placement problem.
```bash
$ python src/model/data_placement.py -ql -relax -type CR --num_nodes 5 --replication_factor 3 --min_targets_per_disk 6 --init_timelimit 600
...
2025-02-24 14:25:13.623 | SUCCESS | __main__:solve:165 - optimal solution:
- Status: ok
Termination condition: optimal
Termination message: TerminationCondition.optimal
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 1,2: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 1,3: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 1,4: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 1,5: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 2,1: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 2,3: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 2,4: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 2,5: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 3,1: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 3,2: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 3,4: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 3,5: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 4,1: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 4,2: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 4,3: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 4,5: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 5,1: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 5,2: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 5,3: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 5,4: 1.5
2025-02-24 14:25:13.624 | INFO | __main__:check_solution:331 - min_peer_traffic=1.5 max_peer_traffic=1.5
2025-02-24 14:25:13.624 | INFO | __main__:check_solution:332 - total_traffic=30.0 max_total_traffic=30
2025-02-24 14:25:14.147 | SUCCESS | __main__:run:147 - saved solution to: output/DataPlacementModel-v_5-b_10-r_6-k_3-λ_2-lb_1-ub_1
```
Note that some combinations of `--num_nodes` and `--replication_factor` may have no solution.
Then generate commands to create/remove storage targets.
```bash
$ python src/setup/gen_chain_table.py --chain_table_type CR --node_id_begin 10001 --node_id_end 10005 --num_disks_per_node 16 --num_targets_per_disk 6 --incidence_matrix_path output/DataPlacementModel-v_5-b_10-r_6-k_3-λ_2-lb_1-ub_1/incidence_matrix.pickle
$ ls -1 output/
DataPlacementModel-v_5-b_10-r_6-k_3-λ_2-lb_1-ub_1
appsi_highs.log
create_target_cmd.txt
generated_chain_table.csv
generated_chains.csv
remove_target_cmd.txt
```

View File

@@ -0,0 +1,12 @@
psutil
pandas
plotly
loguru
highspy==1.8.0
pyomo==6.8.0
coverage~=7.4.4
pytest==8.2.1
pytest-cov==5.0.0
pytest-forked==1.6.0
pytest-xdist==3.6.1
pytest-timeout==2.3.1

View File

View File

@@ -0,0 +1,549 @@
import math
import pickle
import random
import time
import psutil
import os.path
import pandas as pd
import pyomo.environ as po
import plotly.express as px
from typing import Dict, Generator, Literal, Tuple
from loguru import logger
from pyomo.opt import SolverStatus, TerminationCondition
class InfeasibleModel(Exception):
    """Raised when the solver proves the placement model has no feasible solution."""
class SolverTimeout(Exception):
    """Raised when the solver hits its time or iteration limit before converging."""
class SolverError(Exception):
    """Raised for any solver failure that is neither infeasibility nor a timeout."""
class InvalidSolution(Exception):
    """Raised when a returned solution fails the model's own validation checks."""
class DataPlacementModel(object):
    def __init__(self, chain_table_type: Literal["EC", "CR"], num_nodes, group_size, num_groups=None, num_targets_per_disk=None, min_targets_per_disk=1, bibd_only=False, qlinearize=False, relax_lb=1, relax_ub=0):
        """Configure a data-placement model instance.

        Args:
            chain_table_type: "EC" or "CR"; only affects recovery_traffic_factor.
            num_nodes: number of storage nodes (block-design v).
            group_size: number of targets per group/chain (block-design k).
            num_groups: number of groups (block-design b); derived from the
                other parameters when omitted.
            num_targets_per_disk: targets per disk (block-design r); when None,
                a feasible (v, b, r, k) combination is searched via find_params().
            min_targets_per_disk: lower bound for r used by the parameter search.
            bibd_only: restrict the parameter search to balanced incomplete
                block designs (see find_params / balanced_incomplete_block_design).
            qlinearize: presumably switches to a linearized formulation; forced
                on for HiGHS solvers in solve() — TODO confirm against build_model.
            relax_lb: lower relaxation bound (semantics set by the model
                constraints, not visible here).
            relax_ub: upper relaxation bound (same caveat as relax_lb).
        """
        # When r is not given explicitly, search for a feasible parameter set.
        if num_targets_per_disk is None:
            num_nodes, num_groups, num_targets_per_disk, group_size = DataPlacementModel.find_params(num_nodes, group_size, min_r=min_targets_per_disk, bibd_only=bibd_only)
        self.chain_table_type = chain_table_type
        self.num_nodes = num_nodes
        self.group_size = group_size
        self.num_targets_per_disk = num_targets_per_disk
        # Fallback: use every available target exactly once (total // k).
        self.num_groups = num_groups or self.num_targets_total // self.group_size
        self.bibd_only = bibd_only
        self.qlinearize = qlinearize
        self.relax_lb = relax_lb
        self.relax_ub = relax_ub
    def __repr__(self):
        """Render the design parameters, e.g. "DataPlacementModel-v=5,b=10,r=6,k=3,λ=2,lb=1,ub=0"."""
        v, b, r, k, λ = self.v, self.b, self.r, self.k, self.λ
        lb, ub = self.relax_lb, self.relax_ub
        return f"{self.__class__.__name__}-{v=},{b=},{r=},{k=},{λ=},{lb=},{ub=}"
    # str() and repr() are interchangeable for this class.
    __str__ = __repr__
@property
def path_name(self):
return str(self).translate(str.maketrans(' ,:=', '---_'))
    @property
    def v(self):
        # Block-design notation: v = number of nodes (points).
        return self.num_nodes
    @property
    def b(self):
        # Block-design notation: b = number of groups (blocks).
        return self.num_groups
    @property
    def r(self):
        # Block-design notation: r = targets per disk (replication of a point).
        return self.num_targets_per_disk
    @property
    def k(self):
        # Block-design notation: k = group size (block size).
        return self.group_size
    @property
    def λ(self):
        # Block-design notation: alias of max_recovery_traffic_on_peer.
        return self.max_recovery_traffic_on_peer
@property
def num_targets_used(self):
return self.num_groups * self.group_size
@property
def num_targets_total(self):
return self.num_nodes * self.num_targets_per_disk
@property
def all_targets_used(self):
return self.num_targets_used == self.num_targets_total
@property
def balanced_peer_traffic(self):
return self.all_targets_used and self.sum_recovery_traffic_per_failure % (self.num_nodes-1) == 0
@property
def recovery_traffic_factor(self):
return (self.group_size - 1) if self.chain_table_type == "EC" else 1
@property
def sum_recovery_traffic_per_failure(self):
return self.num_targets_per_disk * self.recovery_traffic_factor
@property
def max_recovery_traffic_on_peer(self):
return math.ceil(self.sum_recovery_traffic_per_failure / (self.num_nodes-1))
@property
def balanced_incomplete_block_design(self):
return self.bibd_only and self.balanced_peer_traffic and self.relax_ub == 0
@staticmethod
def find_params(v, k, min_r=1, max_r=100, bibd_only=False):
if bibd_only: min_r = max(min_r, k)
for r in range(min_r, max_r):
if v * r % k == 0 and r * (k - 1) >= v - 1:
b = v * r // k
if not bibd_only or r * (k - 1) % (v - 1) == 0:
return v, b, r, k
raise ValueError(f"cannot find valid params: {v=}, {k=}")
def run(self, pyomo_solver=None, threads=psutil.cpu_count(logical=False), init_timelimit=1800, max_timelimit=3600*2, auto_relax=False, output_root="output", verbose=False, add_elapsed_time=None):
init_relax_lb = self.relax_lb
init_relax_ub = self.relax_ub
timelimit = 0
num_loops = self.max_recovery_traffic_on_peer*2
os.makedirs(output_root, exist_ok=True)
for loop in range(num_loops):
try:
logger.info(f"solving model with {pyomo_solver} #{loop}: {self}")
if add_elapsed_time is not None:
add_elapsed_time()
timelimit = min(timelimit + init_timelimit, max_timelimit)
instance = self.solve(pyomo_solver, threads, timelimit, output_root, verbose)
if add_elapsed_time is not None:
add_elapsed_time(f"solve model time (lb={self.relax_lb}, ub={self.relax_ub})")
except (InfeasibleModel, SolverTimeout) as ex:
logger.error(f"cannot find solution for current params: {ex}")
if auto_relax:
self.relax_lb = init_relax_lb + (loop+1) // 2
self.relax_ub = init_relax_ub + (loop+2) // 2
continue
elif loop + 1 < num_loops:
logger.critical(f"failed to find solution after {num_loops} attempts")
raise ex
else:
raise ex
else:
output_path = os.path.join(output_root, self.path_name)
os.makedirs(output_path, exist_ok=True)
self.save_solution(instance, output_path)
self.visualize_solution(instance, output_path)
logger.success(f"saved solution to: {output_path}")
return instance
# Fix: logger.catch(...) was a bare statement whose returned decorator was
# discarded (a no-op); it is meant to decorate solve() so unexpected exceptions
# are logged with "failed to solve model" and re-raised.
@logger.catch(reraise=True, message="failed to solve model")
def solve(self, pyomo_solver=None, threads=psutil.cpu_count(logical=False), timelimit=3600, output_path="output", verbose=False):
    """Build one model instance, run the solver, and validate the solution.

    Returns the solved instance.  Raises InfeasibleModel, SolverTimeout,
    SolverError, or InvalidSolution depending on the outcome.
    """
    # HiGHS only accepts the linearized formulation; force it on.
    if "highs" in pyomo_solver:
        self.qlinearize = True
    instance = self.build_model()
    if verbose:
        instance.pprint()
    try:
        results = self.solve_model(instance, pyomo_solver, threads, timelimit, output_path)
    except RuntimeError as ex:
        raise SolverError("unknown runtime error") from ex
    # Classify the solver outcome into success / infeasible / timeout / error.
    if (results.solver.status == SolverStatus.ok) and (results.solver.termination_condition == TerminationCondition.optimal):
        logger.success(f"optimal solution: {str(results.solver)}")
        if pyomo_solver is not None:
            instance.solutions.load_from(results)
    elif results.solver.termination_condition == TerminationCondition.infeasible:
        raise InfeasibleModel(f"infeasible: {str(results.solver)}")
    elif results.solver.termination_condition in (TerminationCondition.maxTimeLimit, TerminationCondition.maxIterations):
        raise SolverTimeout(f"timeout: {str(results.solver)}")
    else:
        raise SolverError(f"error: {str(results.solver)}")
    if verbose:
        self.print_solution(instance)
    # Cross-check the loaded solution against the model's traffic bounds.
    try:
        self.check_solution(instance)
    except AssertionError as ex:
        raise InvalidSolution from ex
    return instance
def build_model(self):
    """Construct the Pyomo ConcreteModel for the data-placement problem.

    Decision variable disk_used_by_group[d, g] = 1 iff disk d hosts a target of
    group g.  Constraints enforce per-disk capacity, exact group size, and
    bounds on pairwise ("peer") recovery traffic.  The objective is a constant:
    this is solved as a pure feasibility problem.
    """
    logger.info(f"{self.num_nodes=} {self.num_targets_per_disk=} {self.group_size=} {self.num_groups=} {self.qlinearize=} {self.relax_lb=} {self.relax_ub=}")
    # v >= k
    assert self.num_nodes >= self.group_size, f"{self.num_nodes=} < {self.group_size=}"
    # Fisher's inequality
    if self.balanced_incomplete_block_design:
        # b >= v
        assert self.num_groups >= self.num_nodes, f"{self.num_groups=} < {self.num_nodes=}"
        # r >= k
        assert self.num_targets_per_disk >= self.group_size, f"{self.num_targets_per_disk=} < {self.group_size=}"
    logger.info(f"{self.sum_recovery_traffic_per_failure=} {self.max_recovery_traffic_on_peer=}")
    if self.sum_recovery_traffic_per_failure < self.num_nodes - 1:
        logger.warning(f"some disks do not share recovery traffic: {self.sum_recovery_traffic_per_failure=} < {self.num_nodes=} - 1")
    logger.info(f"{self.all_targets_used=} {self.balanced_peer_traffic=}")
    logger.info(f"{self.num_targets_used=} {self.num_targets_total=}")
    if self.num_targets_used < self.num_targets_total:
        logger.warning(f"some disks have unused targets: {self.num_targets_used=} < {self.num_targets_total=}")
    else:
        assert self.num_targets_used == self.num_targets_total, f"{self.num_targets_used=} > {self.num_targets_total=}"
    model = po.ConcreteModel()
    # index sets
    model.disks = po.RangeSet(1, self.num_nodes)
    model.target_idxs = po.RangeSet(1, self.num_targets_per_disk)
    model.targets = model.disks * model.target_idxs
    model.groups = po.RangeSet(1, self.num_groups)
    def disk_pairs_init(model):
        # Unordered disk pairs (disk, peer) with peer > disk.
        for disk in model.disks:
            for peer in model.disks:
                if peer > disk:
                    yield (disk, peer)
    model.disk_pairs = po.Set(dimen=2, initialize=disk_pairs_init)
    # variables
    model.disk_used_by_group = po.Var(model.disks, model.groups, domain=po.Binary)
    if self.qlinearize:
        # Auxiliary binary that linearizes the product
        # disk_used_by_group[d,g] * disk_used_by_group[p,g] (McCormick-style).
        model.disk_in_same_group = po.Var(model.disk_pairs, model.groups, domain=po.Binary)
    # constraints
    def calc_disk_in_same_group(model, disk, peer, group):
        # Quadratic same-group indicator, used directly when not linearizing.
        return model.disk_used_by_group[disk,group] * model.disk_used_by_group[peer,group]
    def define_disk_in_same_group_lower_bound(model, disk, peer, group):
        # z >= x + y - 1: forces z = 1 when both disks are in the group.
        return model.disk_used_by_group[disk,group] + model.disk_used_by_group[peer,group] <= model.disk_in_same_group[disk,peer,group] + 1
    def define_disk_in_same_group_upper_bound1(model, disk, peer, group):
        # z <= x: z cannot exceed either factor.
        return model.disk_in_same_group[disk,peer,group] <= model.disk_used_by_group[disk,group]
    def define_disk_in_same_group_upper_bound2(model, disk, peer, group):
        # z <= y.
        return model.disk_in_same_group[disk,peer,group] <= model.disk_used_by_group[peer,group]
    if self.qlinearize:
        model.define_disk_in_same_group_lower_bound_eqn = po.Constraint(model.disk_pairs, model.groups, rule=define_disk_in_same_group_lower_bound)
        model.define_disk_in_same_group_upper_bound1_eqn = po.Constraint(model.disk_pairs, model.groups, rule=define_disk_in_same_group_upper_bound1)
        model.define_disk_in_same_group_upper_bound2_eqn = po.Constraint(model.disk_pairs, model.groups, rule=define_disk_in_same_group_upper_bound2)
    def each_disk_has_limited_capcity(model, disk):
        # Each disk hosts at most (or, when all targets must be used, exactly)
        # num_targets_per_disk group memberships.
        if self.all_targets_used:
            return po.quicksum(model.disk_used_by_group[disk,group] for group in model.groups) == self.num_targets_per_disk
        else:
            return po.quicksum(model.disk_used_by_group[disk,group] for group in model.groups) <= self.num_targets_per_disk
    model.each_disk_has_limited_capcity_eqn = po.Constraint(model.disks, rule=each_disk_has_limited_capcity)
    def enough_disks_assigned_to_each_group(model, group):
        # Every group gets exactly group_size member disks.
        return po.quicksum(model.disk_used_by_group[disk,group] for disk in model.disks) == self.group_size
    model.enough_disks_assigned_to_each_group_eqn = po.Constraint(model.groups, rule=enough_disks_assigned_to_each_group)
    def calc_peer_recovery_traffic(model, disk, peer):
        # Number of groups shared by the pair — proportional to recovery traffic.
        if self.qlinearize:
            return po.quicksum(model.disk_in_same_group[disk,peer,group] for group in model.groups)
        else:
            return po.quicksum(calc_disk_in_same_group(model, disk, peer, group) for group in model.groups)
    def peer_recovery_traffic_upper_bound(model, disk, peer):
        # BIBD: every pair shares exactly the balanced amount; otherwise cap it
        # at the balanced amount plus the allowed relaxation.
        if self.balanced_incomplete_block_design:
            return calc_peer_recovery_traffic(model, disk, peer) == self.max_recovery_traffic_on_peer
        else:
            return calc_peer_recovery_traffic(model, disk, peer) <= self.max_recovery_traffic_on_peer + self.relax_ub
    model.peer_recovery_traffic_upper_bound_eqn = po.Constraint(model.disk_pairs, rule=peer_recovery_traffic_upper_bound)
    def peer_recovery_traffic_lower_bound(model, disk, peer):
        return calc_peer_recovery_traffic(model, disk, peer) >= max(0, self.max_recovery_traffic_on_peer - self.relax_lb)
    if self.balanced_incomplete_block_design:
        logger.info(f"lower bound not needed for balanced incomplete block design (BIBD)")
    elif self.all_targets_used:
        logger.info(f"lower bound imposed on peer traffic: {self.relax_lb=} {self.qlinearize=} {self.all_targets_used=}")
        model.peer_recovery_traffic_lower_bound_eqn = po.Constraint(model.disk_pairs, rule=peer_recovery_traffic_lower_bound)
    else:
        logger.info(f"lower bound not imposed on peer traffic: {self.relax_lb=} {self.qlinearize=} {self.all_targets_used=}")
    def total_recovery_traffic(model):
        # NOTE(review): references disk_in_same_group, which exists only when
        # qlinearize is set; currently dead code since the objective below is a dummy.
        return po.summation(model.disk_in_same_group) * 2
    # model.obj = po.Objective(rule=total_recovery_traffic, sense=po.minimize)
    model.obj = po.Objective(expr=1) # dummy objective
    return model
def solve_model(self, instance, pyomo_solver, threads, timelimit, output_path):
    """Dispatch the instance to the named Pyomo solver and return its results.

    Solutions are NOT loaded into the instance (load_solutions=False); the
    caller decides when to load them.  Raises ValueError when pyomo_solver is
    None.  Solver output is teed to the console and logged under output_path.
    """
    # Guard clause first (also drops the pointless f-prefix on a literal).
    if pyomo_solver is None:
        raise ValueError("no solver specified")
    solver = po.SolverFactory(pyomo_solver)
    return solver.solve(
        instance,
        options={"threads": str(threads), "log_file": os.path.join(output_path, f"{pyomo_solver}.log")},
        load_solutions=False,
        timelimit=timelimit,
        tee=True)
def get_peer_traffic(self, instance) -> Dict[Tuple[int,int], int]:
    """Return the recovery traffic between every ordered (disk, peer) pair.

    For each pair, counts the groups both disks belong to in the solved
    instance and scales by recovery_traffic_factor / (group_size - 1).
    """
    peer_traffic_map = {}
    for disk in instance.disks:
        for peer in instance.disks:
            if disk == peer:
                continue
            shared_groups = sum(
                po.value(instance.disk_used_by_group[disk,group]) *
                po.value(instance.disk_used_by_group[peer,group])
                for group in instance.groups)
            peer_traffic_map[(disk, peer)] = shared_groups * self.recovery_traffic_factor / (self.group_size - 1)
    return peer_traffic_map
def get_incidence_matrix(self, instance) -> Dict[Tuple[int, int], bool]:
    """Extract {(disk, group): True} for every assignment set to 1 in the solution."""
    incidence_matrix = {
        (disk, group): True
        for disk in instance.disks
        for group in instance.groups
        if math.isclose(po.value(instance.disk_used_by_group[disk, group]), 1)
    }
    if self.all_targets_used:
        # With all targets used, assignments divide evenly by nodes and by groups.
        assert len(incidence_matrix) % self.num_nodes == 0, f"{len(incidence_matrix)=} % {self.num_nodes=}"
        assert len(incidence_matrix) % self.num_groups == 0, f"{len(incidence_matrix)=} % {self.num_groups=}"
    return incidence_matrix
def check_solution(self, instance):
    """Validate the solved instance against the peer-traffic bounds.

    Returns (total_traffic, min_peer_traffic, max_peer_traffic).  Raises
    AssertionError on violation; solve() wraps that as InvalidSolution.
    """
    # Detect whether the optional lower-bound constraint was part of this model.
    has_peer_traffic_lower_bound = False
    for c in instance.component_objects(po.Constraint):
        if "peer_recovery_traffic_lower_bound_eqn" in str(c):
            has_peer_traffic_lower_bound = True
    peer_traffic_map = self.get_peer_traffic(instance)
    for (disk, peer), peer_traffic in peer_traffic_map.items():
        logger.debug(f"{disk},{peer}: {peer_traffic:.1f}")
        # 1e-5 tolerances absorb solver round-off on the binary products.
        assert peer_traffic <= self.max_recovery_traffic_on_peer + self.relax_ub + 1e-5, f"{peer_traffic=} > {self.max_recovery_traffic_on_peer=} + {self.relax_ub}"
        if has_peer_traffic_lower_bound:
            assert peer_traffic >= max(0, self.max_recovery_traffic_on_peer - self.relax_lb) - 1e-5, f"{peer_traffic=} < {self.max_recovery_traffic_on_peer=} - {self.relax_lb}"
    min_peer_traffic = min(peer_traffic_map.values())
    max_peer_traffic = max(peer_traffic_map.values())
    total_traffic = sum(peer_traffic_map.values())
    max_total_traffic = self.num_nodes * self.sum_recovery_traffic_per_failure
    logger.info(f"{min_peer_traffic=:.1f} {max_peer_traffic=:.1f}")
    logger.info(f"{total_traffic=} {max_total_traffic=}")
    # Spread between the most and least loaded peer must fit the relaxations.
    peer_traffic_diff = max_peer_traffic - min_peer_traffic
    if has_peer_traffic_lower_bound:
        assert peer_traffic_diff <= self.relax_ub + self.relax_lb + 1e-5, f"{peer_traffic_diff=}"
    if self.balanced_incomplete_block_design:
        # A BIBD distributes recovery traffic perfectly evenly across peers.
        assert math.isclose(peer_traffic_diff, 0.0, abs_tol=1e-9), f"{peer_traffic_diff=}"
    assert total_traffic <= max_total_traffic + 1e-5
    return total_traffic, min_peer_traffic, max_peer_traffic
def print_solution(self, instance):
    """Log every disk/group assignment variable whose solved value equals 1."""
    for disk in instance.disks:
        for group in instance.groups:
            assignment = instance.disk_used_by_group[disk, group]
            if not math.isclose(po.value(assignment), 1):
                continue
            logger.info(f"{assignment}: {po.value(assignment)}")
def save_solution(self, instance, output_path: str="output"):
    """Pickle the incidence matrix and the peer-traffic map under output_path."""
    artifacts = {
        "incidence_matrix.pickle": self.get_incidence_matrix(instance),
        "peer_traffic_map.pickle": self.get_peer_traffic(instance),
    }
    for filename, payload in artifacts.items():
        with open(os.path.join(output_path, filename), "wb") as fout:
            pickle.dump(payload, fout)
def visualize_solution(self, instance, output_path: str="output", write_html=True):
    """Render the disk/group incidence as a scatter plot and optionally write it to HTML.

    Returns the plotly figure.
    """
    incidence_matrix = self.get_incidence_matrix(instance)
    disks, groups = zip(*incidence_matrix.keys())
    incidence_df = pd.DataFrame(zip(disks, groups), columns=["disk", "group"])
    peer_traffic_values = self.get_peer_traffic(instance).values()
    min_peer_traffic = min(peer_traffic_values)
    max_peer_traffic = max(peer_traffic_values)
    fig = px.scatter(
        incidence_df,
        x="disk",
        y="group",
        title=f"{self}, min/max peer traffic: {min_peer_traffic:.1f}/{max_peer_traffic:.1f}")
    def integer_axis(count):
        # Force one tick per integer id instead of plotly's automatic ticks.
        return dict(tickmode='array', tickvals=list(range(1, count + 1)))
    fig.update_layout(
        xaxis_title="Nodes",
        yaxis_title="Groups",
        xaxis=integer_axis(self.num_nodes),
        yaxis=integer_axis(self.num_groups),
    )
    if write_html:
        fig.write_html(os.path.join(output_path, f"data_placement.html"), include_plotlyjs=True)
    return fig
class RebalanceTrafficModel(DataPlacementModel):
    """Placement model for cluster expansion.

    Given the incidence matrix of an existing layout, finds a new layout for a
    larger cluster that minimizes the number of existing targets that must be
    moved, while keeping the base model's recovery-traffic constraints.
    """
    def __init__(self, existing_incidence_matrix, chain_table_type: Literal["EC", "CR"], num_nodes, group_size, num_groups=None, num_targets_per_disk=None, min_targets_per_disk=1, bibd_only=False, qlinearize=False, relax_lb=1, relax_ub=0):
        # existing_incidence_matrix: {(disk, group): True} describing the current layout.
        self.existing_incidence_matrix = existing_incidence_matrix
        self.existing_disks, self.existing_groups = zip(*existing_incidence_matrix.keys())
        # Each disk must be able to host at least the average existing load.
        num_existing_targets_per_disk = math.ceil(self.total_existing_targets / self.num_existing_disk)
        min_targets_per_disk = max(min_targets_per_disk, num_existing_targets_per_disk)
        if num_targets_per_disk is None:
            num_nodes, num_groups, num_targets_per_disk, group_size = DataPlacementModel.find_params(num_nodes, group_size, min_r=min_targets_per_disk, bibd_only=bibd_only)
        else:
            assert num_targets_per_disk >= min_targets_per_disk
        super().__init__(chain_table_type, num_nodes, group_size, num_groups, num_targets_per_disk, min_targets_per_disk, bibd_only, qlinearize, relax_lb, relax_ub)
    @property
    def num_existing_disk(self):
        # Disks are numbered 1..N, so the max id equals the count.
        return max(self.existing_disks)
    @property
    def num_existing_groups(self):
        # Groups are numbered 1..B, so the max id equals the count.
        return max(self.existing_groups)
    @property
    def total_existing_targets(self):
        # One (disk, group) entry per existing target.
        return len(self.existing_disks)
    @property
    def existing_group_size(self):
        # Existing layout must have uniform group size.
        assert self.total_existing_targets % self.num_existing_groups == 0, f"{self.total_existing_targets=} % {self.num_existing_groups=}"
        return self.total_existing_targets // self.num_existing_groups
    def build_model(self):
        """Extend the base model with rebalance constraints and a move-minimizing objective."""
        max_existing_targets_per_disk = math.ceil(self.total_existing_targets / self.num_nodes)
        logger.info(f"{self.num_existing_disk=} {self.num_existing_groups=} {self.total_existing_targets=} {max_existing_targets_per_disk=}")
        # The new cluster must be a superset of the old one with identical group size.
        assert self.num_nodes >= self.num_existing_disk, f"{self.num_nodes=} < {self.num_existing_disk=}"
        assert self.num_groups >= self.num_existing_groups, f"{self.num_groups=} < {self.num_existing_groups=}"
        assert self.group_size == self.existing_group_size, f"{self.group_size=} != {self.existing_group_size=}"
        # NOTE(review): message prints '>=' though the failure condition is '<'.
        assert self.num_targets_per_disk >= max_existing_targets_per_disk, f"{self.num_targets_per_disk=} >= {max_existing_targets_per_disk=}"
        model = super().build_model()
        def existing_targets_evenly_distributed_to_disks(model, disk):
            # Cap how many of the pre-existing groups land on any single disk.
            return po.quicksum(model.disk_used_by_group[disk,group] for group in model.groups if group <= self.num_existing_groups) <= max_existing_targets_per_disk
        model.existing_targets_evenly_distributed_to_disks_eqn = po.Constraint(model.disks, rule=existing_targets_evenly_distributed_to_disks)
        def num_existing_targets_not_moved(model):
            # Count assignments that coincide with the existing layout.
            return po.quicksum(model.disk_used_by_group[disk,group] for disk in model.disks for group in model.groups if (disk,group) in self.existing_incidence_matrix)
        def total_rebalance_traffic(model):
            # Every existing target NOT kept in place must be migrated.
            return self.total_existing_targets - num_existing_targets_not_moved(model)
        # NOTE(review): a rule function is passed via `expr=`; Pyomo's documented
        # keyword for callables is `rule=` — confirm `expr=` accepts callables in
        # the pinned Pyomo version.
        model.obj = po.Objective(expr=total_rebalance_traffic, sense=po.minimize)
        return model
    def visualize_solution(self, instance, output_path = "output", write_html=True):
        """Scatter-plot the new layout, coloring groups added by the expansion."""
        incidence_matrix = self.get_incidence_matrix(instance)
        disks, groups = zip(*incidence_matrix.keys())
        # "new" marks groups beyond the pre-existing ones.
        incidence_df = pd.DataFrame(zip(disks, groups, [g > self.num_existing_groups for g in groups]), columns=["disk", "group", "new"])
        peer_traffic_map = self.get_peer_traffic(instance)
        min_peer_traffic = min(peer_traffic_map.values())
        max_peer_traffic = max(peer_traffic_map.values())
        fig = px.scatter(
            incidence_df,
            x="disk",
            y="group",
            color="new",
            title=f"{self}, min/max peer traffic: {min_peer_traffic:.1f}/{max_peer_traffic:.1f}, rebalance traffic: {po.value(instance.obj.expr)}")
        fig.update_layout(
            xaxis_title="Nodes",
            yaxis_title="Groups",
            xaxis = dict(
                tickmode = 'array',
                tickvals = list(range(1, self.num_nodes+1)),
            ),
            yaxis = dict(
                tickmode = 'array',
                tickvals = list(range(1, self.num_groups+1)),
            ),
        )
        if write_html:
            fig.write_html(os.path.join(output_path, f"{self.path_name}.html"), include_plotlyjs=True)
        return fig
def main():
    """CLI entry point: solve a fresh placement model, or a rebalance model when
    an existing incidence matrix is supplied via -m/--existing_incidence_matrix."""
    import psutil
    import argparse
    parser = argparse.ArgumentParser(prog="model.py", description="3FS data placement")
    parser.add_argument("-pyomo", "--pyomo_solver", default="appsi_highs", choices=["appsi_highs", "cbc", "scip"], help="Solver used by Pyomo")
    parser.add_argument("-type", "--chain_table_type", type=str, required=True, choices=["CR", "EC"], help="CR - Chain Replication; EC - Erasure Coding")
    parser.add_argument("-j", "--solver_threads", type=int, default=psutil.cpu_count(logical=False)//2, help="Number of solver threads")
    parser.add_argument("-v", "--num_nodes", type=int, required=True, help="Number of storage nodes")
    parser.add_argument("-r", "--num_targets_per_disk", type=int, default=None, help="Number of storage targets on each disk")
    parser.add_argument("-min_r", "--min_targets_per_disk", type=int, default=1, help="Min number of storage targets on each disk")
    parser.add_argument("-k", "--replication_factor", "--group_size", dest="group_size", type=int, default=3, help="Replication factor or erasure coding group size")
    parser.add_argument("-b", "--num_groups", type=int, default=None, help="Number of chains or EC groups")
    parser.add_argument("-ql", "--qlinearize", action="store_true", help="Enable linearization of quadratic equations")
    parser.add_argument("-lb", "--relax_lb", type=int, default=1, help="Relax the lower bound of peer recovery traffic")
    parser.add_argument("-ub", "--relax_ub", type=int, default=0, help="Relax the upper bound of peer recovery traffic")
    parser.add_argument("-relax", "--auto_relax", action="store_true", help="Auto relax the lower/upper bound of peer recovery traffic when timeout")
    parser.add_argument("-bibd", "--bibd_only", action="store_true", help="Only create balanced incomplete block design (BIBD)")
    parser.add_argument("-t", "--init_timelimit", type=int, default=1800, help="Initial timeout for solver")
    parser.add_argument("-T", "--max_timelimit", type=int, default=3600*2, help="Max timeout for solver")
    parser.add_argument("-o", "--output_path", default="output", help="Path of output files")
    parser.add_argument("-m", "--existing_incidence_matrix", default=None, help="Existing incidence matrix for rebalance traffic model")
    parser.add_argument("-V", "--verbose", action="store_true", help="Show verbose output")
    args = parser.parse_args()
    if args.existing_incidence_matrix is None:
        # Fresh placement: solve the base model from scratch.
        DataPlacementModel(
            args.chain_table_type,
            args.num_nodes,
            args.group_size,
            args.num_groups,
            args.num_targets_per_disk,
            args.min_targets_per_disk,
            args.bibd_only,
            args.qlinearize,
            args.relax_lb,
            args.relax_ub,
        ).run(
            args.pyomo_solver,
            args.solver_threads,
            args.init_timelimit,
            args.max_timelimit,
            args.auto_relax,
            args.output_path,
            args.verbose)
    else:
        # Rebalance: load the current layout and minimize moved targets.
        with open(args.existing_incidence_matrix, "rb") as fin:
            existing_incidence_matrix = pickle.load(fin)
        RebalanceTrafficModel(
            existing_incidence_matrix,
            args.chain_table_type,
            args.num_nodes,
            args.group_size,
            args.num_groups,
            args.num_targets_per_disk,
            args.min_targets_per_disk,
            args.bibd_only,
            args.qlinearize,
            args.relax_lb,
            args.relax_ub,
        ).run(
            args.pyomo_solver,
            args.solver_threads,
            args.init_timelimit,
            args.max_timelimit,
            args.auto_relax,
            args.output_path,
            args.verbose)
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,108 @@
# local test
# pytest test/test_plan.py -v -x
# production setup
import functools
import socket
import sys
import os.path
import itertools
import pandas as pd
import pyarrow as arrow
from typing import List, Literal
from loguru import logger
from smallpond.common import pytest_running
from smallpond.logical.dataset import ArrowTableDataSet
from smallpond.logical.node import Context, ConsolidateNode, DataSetPartitionNode, DataSourceNode, ArrowComputeNode, LogicalPlan, SqlEngineNode
from smallpond.execution.driver import Driver
from smallpond.execution.task import RuntimeContext, ArrowComputeTask
def solve_model(runtime_task: ArrowComputeTask,
                chain_table_type, num_nodes, group_size, min_targets_per_disk,
                init_timelimit, max_timelimit,
                pyomo_solver="appsi_highs"):
    """Build and solve one DataPlacementModel inside a smallpond compute task.

    Returns (model, solved_instance).  Solver artifacts are written under the
    task's runtime output directory, and elapsed time is reported through the
    task's profiling hook.
    """
    import logging
    # Silence pyomo's verbose INFO output inside worker tasks.
    pyomo_logger = logging.getLogger('pyomo')
    pyomo_logger.setLevel(logging.WARNING)
    # Import the model either as an installed package or relative to this file.
    try:
        from src.model.data_placement import DataPlacementModel
    except ImportError:  # fix: was a bare `except:`, which also hid unrelated errors
        sys.path.append(os.path.dirname(os.path.abspath(__file__)))
        from src.model.data_placement import DataPlacementModel
    model = DataPlacementModel(chain_table_type, num_nodes, group_size, min_targets_per_disk=min_targets_per_disk, bibd_only=False, qlinearize=True, relax_lb=1, relax_ub=0)
    runtime_task.add_elapsed_time("build model time")
    instance = model.run(
        pyomo_solver=pyomo_solver,
        threads=runtime_task.cpu_limit,
        init_timelimit=init_timelimit,
        max_timelimit=max_timelimit,
        auto_relax=True,
        output_root=runtime_task.runtime_output_abspath,
        add_elapsed_time=runtime_task.add_elapsed_time)
    return model, instance
def solve_loop(runtime_ctx: RuntimeContext, input_tables: List[arrow.Table],
               init_timelimit, max_timelimit,
               pyomo_solver="appsi_highs") -> arrow.Table:
    """Solve the placement model for each parameter row and return one combined
    Arrow table of solutions (or None when the input partition is empty)."""
    runtime_task = runtime_ctx.task
    model_params, = input_tables
    schema = arrow.schema([
        arrow.field("chain_table_type", arrow.string()),
        arrow.field("num_nodes", arrow.uint32()),
        arrow.field("group_size", arrow.uint32()),
        arrow.field("disks", arrow.list_(arrow.uint32())),
        arrow.field("groups", arrow.list_(arrow.uint32())),
    ])
    solution_tables = []
    # Each row of the partition describes one model to solve.
    for chain_table_type, num_nodes, group_size, min_targets_per_disk in zip(*model_params.to_pydict().values()):
        model, instance = solve_model(runtime_task, chain_table_type, num_nodes, group_size, min_targets_per_disk, init_timelimit, max_timelimit, pyomo_solver)
        incidence_matrix = model.get_incidence_matrix(instance)
        disks, groups = zip(*incidence_matrix.keys())
        solution_tables.append(arrow.Table.from_arrays([[chain_table_type], [num_nodes], [group_size], [disks], [groups]], schema=schema))
    if not solution_tables:
        return None
    return functools.reduce(lambda merged, table: arrow.concat_tables((merged, table)), solution_tables)
def search_data_placement_plans(
        chain_table_type: Literal["EC", "CR"],
        num_nodes: List[int], group_size: List[int], min_targets_per_disk=1,
        init_timelimit=1800, max_timelimit=3600*3,
        solver_threads: int=64,
        pyomo_solver="appsi_highs",
        num_executors=None):
    """Build a smallpond LogicalPlan that solves one placement model per
    (num_nodes, group_size) combination, one partition per parameter row.

    Fix: accept `num_executors` (currently unused here) so main() can forward
    `driver.num_executors` without raising TypeError; it is backward-compatible
    with existing callers that omit it.
    """
    # Only combinations with at least as many nodes as the group size are feasible.
    params = pd.DataFrame([(chain_table_type, v, k, min_targets_per_disk)
                           for v, k in itertools.product(num_nodes, group_size) if v >= k],
                          columns=["chain_table_type", "num_nodes", "group_size", "min_targets_per_disk"])
    logger.warning(f"params: {params}")
    ctx = Context()
    params_source = DataSourceNode(ctx, ArrowTableDataSet(arrow.Table.from_pandas(params)))
    # One partition per row so each model solves in its own task.
    params_partitions = DataSetPartitionNode(ctx, (params_source,), npartitions=len(params), partition_by_rows=True)
    data_placement_sols = ArrowComputeNode(
        ctx, (params_partitions,),
        process_func=functools.partial(solve_loop, init_timelimit=init_timelimit, max_timelimit=max_timelimit, pyomo_solver=pyomo_solver),
        cpu_limit=solver_threads)
    return LogicalPlan(ctx, data_placement_sols)
def main():
    """CLI entry point: build the placement-search plan and execute it on the smallpond driver."""
    driver = Driver()
    driver.add_argument("-pyomo", "--pyomo_solver", default="appsi_highs", choices=["appsi_highs", "cbc", "scip"], help="Solver used by Pyomo")
    driver.add_argument("-type", "--chain_table_type", type=str, required=True, choices=["EC", "CR"], help="CR - Chain Replication; EC - Erasure Coding")
    driver.add_argument("-v", "--num_nodes", nargs="+", type=int, required=True, help="Number of storage nodes")
    driver.add_argument("-k", "--replication_factor", "--group_size", dest="group_size", type=int, default=3, help="Replication factor or erasure coding group size")
    driver.add_argument("-min_r", "--min_targets_per_disk", type=int, default=1, help="Min number of storage targets on each disk")
    driver.add_argument("-j", "--solver_threads", type=int, default=32, help="Number of solver threads")
    driver.add_argument("-t", "--init_timelimit", type=int, default=1800, help="Initial timeout for solver")
    driver.add_argument("-T", "--max_timelimit", type=int, default=3600*3, help="Max timeout for solver")
    # NOTE(review): num_executors is forwarded here but search_data_placement_plans'
    # signature (above) does not declare such a parameter — verify it is accepted,
    # otherwise this call raises TypeError.
    plan = search_data_placement_plans(num_executors=driver.num_executors, **driver.get_arguments())
    driver.run(plan)
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,124 @@
import argparse
import os.path
from collections import Counter, defaultdict, namedtuple
import pickle
from typing import Dict, List, Literal, Tuple
# A physical storage target: its globally-unique id plus host node and disk slot.
Target = namedtuple("Target", ["target_id", "node_id", "disk_index"])
# A replication chain (CR) or EC group: chain id plus its member targets.
Chain = namedtuple("Chain", ["chain_id", "target_list"])
def calc_target_id(target_id_prefix: int, node_id: int, disk_index: int, target_index: int):
    """Encode (prefix, node, disk, target) into a single decimal-positional id.

    Layout: <prefix><node_id:6 digits><disk_index+1:3 digits><target_index+1:2 digits>.
    """
    with_node = target_id_prefix * 1_000_000 + node_id
    with_disk = with_node * 1_000 + (disk_index + 1)
    return with_disk * 100 + (target_index + 1)
def generate_chains(
    chain_table_type: Literal["EC", "CR"],
    node_id_begin: int,
    node_id_end: int,
    num_disks_per_node: int,
    num_targets_per_disk: int,
    target_id_prefix: int,
    chain_id_prefix: int,
    incidence_matrix: Dict[Tuple[int, int], bool],
    **kwargs):
    """Expand a single-disk incidence matrix into concrete chains for every disk.

    The same (node, group) layout is replicated across all disk indices.  For
    "CR", each group becomes one chain containing all its member targets; for
    "EC", every member slot of a group becomes its own single-target chain.
    Returns a list of Chain sorted by chain_id.  Extra **kwargs (unrelated CLI
    options passed via vars(args)) are ignored.
    """
    num_nodes = node_id_end - node_id_begin + 1
    # Sorted (node, group) pairs are visited in node-major order below;
    # assumes nodes in the matrix are numbered 1..num_nodes.
    nodes, groups = zip(*sorted(incidence_matrix.keys()))
    group_sizes = list(Counter(groups).values())
    assert max(nodes) == num_nodes, f"{max(nodes)=} != {num_nodes=}"
    assert all(s == group_sizes[0] for s in group_sizes[1:]), f"not all group sizes the same: {group_sizes}"
    assert len(incidence_matrix) % group_sizes[0] == 0, f"{len(incidence_matrix)=} % {group_sizes[0]=} != 0"
    assert len(incidence_matrix) == num_nodes * num_targets_per_disk, f"{len(incidence_matrix)=} != {num_nodes=} * {num_targets_per_disk=}"
    global_target_list = []
    chain_target_list = defaultdict(list)
    for disk_index in range(num_disks_per_node):
        # Per-disk counter of how many member slots of each group were assigned.
        group_slot_idx = defaultdict(int)
        for node_id in range(node_id_begin, node_id_end+1):
            for target_index in range(num_targets_per_disk):
                target_id = calc_target_id(target_id_prefix, node_id, disk_index, target_index)
                # Position of this target in the sorted (node, group) sequence.
                target_pos = (node_id - node_id_begin) * num_targets_per_disk + target_index
                if chain_table_type == "EC":
                    # EC: one chain per (group, member-slot) pair.
                    group_slot_idx[groups[target_pos]] += 1
                    chain_index = (groups[target_pos]-1) * group_sizes[0] + group_slot_idx[groups[target_pos]]
                else:
                    # CR: one chain per group.
                    chain_index = groups[target_pos]
                assert chain_index < 1_00_000, f"{chain_index} >= {1_00_000}"
                # Chain id layout: <prefix><disk_index+1:3 digits><chain_index:5 digits>.
                chain_id = (chain_id_prefix * 1_000 + (disk_index+1)) * 1_00_000 + chain_index
                target = Target(target_id, node_id, disk_index)
                global_target_list.append(target)
                chain_target_list[chain_id].append(target)
    # Sanity checks: unique targets, even distribution across nodes and disks.
    num_targets_on_node = list(Counter(target.node_id for target in global_target_list).values())
    num_targets_on_disk = list(Counter((target.node_id, target.disk_index) for target in global_target_list).values())
    assert len(global_target_list) == len(set(global_target_list)) == num_nodes * num_disks_per_node * num_targets_per_disk
    assert all(x == num_targets_on_node[0] for x in num_targets_on_node[1:])
    assert all(x == num_targets_on_disk[0] for x in num_targets_on_disk[1:])
    if chain_table_type == "EC":
        # EC chains each hold exactly one target.
        assert all(len(target_ids) == 1 for target_ids in chain_target_list.values())
        assert len(chain_target_list) == num_nodes * num_disks_per_node * num_targets_per_disk
    else:
        # CR chains each hold a full replication group.
        assert all(len(target_ids) == group_sizes[0] for target_ids in chain_target_list.values())
        assert len(chain_target_list) == num_nodes * num_disks_per_node * num_targets_per_disk // group_sizes[0]
    return [Chain(chain_id, target_list) for chain_id, target_list in sorted(chain_target_list.items())]
def main():
    """CLI entry point: expand a solved incidence matrix into chain tables and
    admin-CLI command files (create/remove targets)."""
    parser = argparse.ArgumentParser(prog="model.py", description="Generate 3FS create target commands")
    parser.add_argument("-type", "--chain_table_type", type=str, required=True, choices=["EC", "CR"], help="CR - Chain Replication; EC - Erasure Coding")
    parser.add_argument("-b", "--node_id_begin", type=int, required=True, help="The first node id")
    parser.add_argument("-e", "--node_id_end", type=int, required=True, help="The last node id")
    parser.add_argument("-d", "--num_disks_per_node", type=int, required=True, help="Number of disk on each storage node")
    parser.add_argument("-r", "--num_targets_per_disk", type=int, required=True, help="Number of storage targets on each disk")
    parser.add_argument("-tp", "--target_id_prefix", type=int, default=10, help="Prefix of generated target id")
    parser.add_argument("-cp", "--chain_id_prefix", type=int, default=10, help="Prefix of generated chain id")
    parser.add_argument("-cs", "--chunk_size", nargs="+", help="A list of supported file chunk sizes")
    parser.add_argument("-mat", "--incidence_matrix_path", type=str, required=True, help="Incidence matrix generated by data placement model")
    parser.add_argument("-o", "--output_path", default="output", help="Path of output files")
    args = parser.parse_args()
    with open(args.incidence_matrix_path, "rb") as fin:
        incidence_matrix = pickle.load(fin)
    # Bounds keep every component of the composed decimal ids within its digit budget.
    assert len(incidence_matrix) < 1_00_000
    assert args.node_id_end - args.node_id_begin < 1000
    assert args.node_id_end < 1_000_000
    assert args.node_id_begin < 1_000_000
    assert args.num_disks_per_node < 1000
    assert args.num_targets_per_disk < 100
    assert args.target_id_prefix < 100
    assert args.chain_id_prefix < 100
    chain_list = generate_chains(**vars(args), incidence_matrix=incidence_matrix)
    # generated_chains.csv: one row per chain listing all member target ids.
    with open(os.path.join(args.output_path, "generated_chains.csv"), "w") as fout:
        print(f"ChainId,{','.join(['TargetId']*len(chain_list[0].target_list))}", file=fout)
        for chain in chain_list:
            print(f"{chain.chain_id},{','.join(str(target.target_id) for target in chain.target_list)}", file=fout)
    # generated_chain_table.csv: flat list of chain ids.
    with open(os.path.join(args.output_path, "generated_chain_table.csv"), "w") as fout:
        print("ChainId", file=fout)
        for chain in chain_list:
            print(f"{chain.chain_id}", file=fout)
    # create_target_cmd.txt: admin-cli commands to create every target.
    with open(os.path.join(args.output_path, "create_target_cmd.txt"), "w") as fout:
        chunk_size_opt = f"--chunk-size {' '.join(args.chunk_size)}" if args.chunk_size else ""
        for chain in chain_list:
            for target in chain.target_list:
                print(f"create-target --node-id {target.node_id} --disk-index {target.disk_index} --target-id {target.target_id} --chain-id {chain.chain_id} {chunk_size_opt} --use-new-chunk-engine", file=fout)
    # remove_target_cmd.txt: commands to offline and then remove every target.
    with open(os.path.join(args.output_path, "remove_target_cmd.txt"), "w") as fout:
        for chain in chain_list:
            for target in chain.target_list:
                print(f"offline-target --node-id {target.node_id} --target-id {target.target_id}", file=fout)
                print(f"remove-target --node-id {target.node_id} --target-id {target.target_id}", file=fout)
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,94 @@
import copy
import glob
import os.path
import importlib
import shutil
import tempfile
import pytest
from src.model.data_placement import DataPlacementModel, RebalanceTrafficModel
# Parameter sets exercised by the parametrized placement-model tests below.
placement_params = [
    # simple cases for replication group
    {
        "chain_table_type": "EC",
        "num_nodes": 5,
        "num_targets_per_disk": 6,
        "group_size": 2,
    },
    {
        "chain_table_type": "EC",
        "num_nodes": 5,
        "num_targets_per_disk": 6,
        "group_size": 3,
    },
    # not all targets used: num_nodes * num_targets_per_disk % group_size != 0
    {
        "chain_table_type": "EC",
        "num_nodes": 7,
        "num_targets_per_disk": 5,
        "group_size": 4,
    },
    # always evenly distributed: num_targets_per_disk * (group_size-1) % (num_nodes-1) == 0
    {
        "chain_table_type": "EC",
        "num_nodes": 8,
        "num_targets_per_disk": 6,
        "group_size": 5,
    },
    # all targets used & evenly distributed
    {
        "chain_table_type": "EC",
        "num_nodes": 10,
        "num_targets_per_disk": 9,
        "group_size": 5,
    },
]
# Cross-product dimensions for the parametrized tests.
qlinearize = [False, True]
relax_lb = [1, 2]
# qlinearize[1:] restricts this test to the linearized form; solve() forces
# qlinearize on for HiGHS anyway, so the quadratic variant would be redundant.
@pytest.mark.parametrize('qlinearize', qlinearize[1:])
@pytest.mark.parametrize('relax_lb', relax_lb)
@pytest.mark.parametrize('placement_params', placement_params)
@pytest.mark.skipif(importlib.util.find_spec("highspy") is None, reason="cannot find solver")
def test_solve_placement_model_with_highs(placement_params, qlinearize, relax_lb):
    """Placement model should solve for every parameter set with the HiGHS solver."""
    DataPlacementModel(
        **placement_params,
        qlinearize=qlinearize,
        relax_lb=relax_lb,
    ).run(pyomo_solver="appsi_highs")
@pytest.mark.parametrize('chain_table_type, num_nodes, group_size', [("CR", 25, 3), ("EC", 25, 20)])
@pytest.mark.skipif(importlib.util.find_spec("highspy") is None, reason="cannot find solver")
def test_solve_placement_model_v25(chain_table_type, num_nodes, group_size):
    """Larger 25-node instance: a short time limit plus auto_relax must still yield a solution."""
    model = DataPlacementModel(
        chain_table_type=chain_table_type,
        num_nodes=num_nodes,
        group_size=group_size,
        qlinearize=True,
        relax_lb=1,
        relax_ub=1,
    )
    model.run(pyomo_solver="appsi_highs", max_timelimit=30, auto_relax=True)
@pytest.mark.parametrize('placement_params', placement_params)
@pytest.mark.skipif(importlib.util.find_spec("highspy") is None, reason="cannot find solver")
def test_solve_rebalance_model(placement_params):
    """Solve a placement model, then rebalance its layout onto twice as many nodes."""
    model = DataPlacementModel(
        **placement_params,
        qlinearize=True,
        relax_lb=1,
        relax_ub=1,
    )
    instance = model.run(pyomo_solver="appsi_highs")
    # Expand to double the nodes; let the model pick num_targets_per_disk itself.
    rebalance_params = copy.deepcopy(placement_params)
    rebalance_params["num_nodes"] *= 2
    rebalance_params.pop("num_targets_per_disk")
    RebalanceTrafficModel(
        existing_incidence_matrix=model.get_incidence_matrix(instance),
        **rebalance_params,
        qlinearize=True,
        relax_lb=2,
        relax_ub=1,
    ).run(pyomo_solver="appsi_highs", max_timelimit=15, auto_relax=True)

View File

@@ -0,0 +1,10 @@
from smallpond.test_fabric import TestFabric
from src.model.data_placement_job import search_data_placement_plans
class TestPlan(TestFabric):
    """End-to-end check that a data-placement search plan builds and executes."""
    def test_search_data_placement_plans(self):
        solvers = ["appsi_highs"]
        for solver_name in solvers:
            with self.subTest(pyomo_solver=solver_name):
                plan = search_data_placement_plans(chain_table_type="EC", num_nodes=[10], group_size=[5, 9], solver_threads=16, pyomo_solver=solver_name)
                self.execute_plan(plan, num_executors=1)

View File

@@ -0,0 +1,55 @@
from collections import Counter
import glob
import os.path
import pytest
from src.model.data_placement import DataPlacementModel
from src.setup.gen_chain_table import generate_chains
@pytest.mark.parametrize('num_nodes, num_disks_per_node, num_targets_per_disk, num_replicas', [(5, 10, 6, 2), (10, 10, 9, 3)])
def test_generate_cr_chains(num_nodes: int, num_disks_per_node: int, num_targets_per_disk: int, num_replicas: int):
    """Solve a CR placement model and materialize its chain table."""
    placement = DataPlacementModel(
        chain_table_type="CR",
        num_nodes=num_nodes,
        num_targets_per_disk=num_targets_per_disk,
        group_size=num_replicas,
        qlinearize=True,
        relax_lb=1,
        relax_ub=1,
    )
    solved = placement.run(pyomo_solver="appsi_highs", max_timelimit=15, auto_relax=True)
    generate_chains(
        chain_table_type="CR",
        node_id_begin=1,
        node_id_end=num_nodes,
        num_disks_per_node=num_disks_per_node,
        num_targets_per_disk=num_targets_per_disk,
        target_id_prefix=1,
        chain_id_prefix=9,
        incidence_matrix=placement.get_incidence_matrix(solved),
    )
@pytest.mark.parametrize('num_nodes, num_disks_per_node, num_targets_per_disk, ec_group_size', [(20, 10, 6, 12), (25, 10, 12, 20)])
def test_generate_ec_chains(num_nodes: int, num_disks_per_node: int, num_targets_per_disk: int, ec_group_size: int):
    """Solve an EC placement model and materialize its chain table."""
    placement = DataPlacementModel(
        chain_table_type="EC",
        num_nodes=num_nodes,
        num_targets_per_disk=num_targets_per_disk,
        group_size=ec_group_size,
        qlinearize=True,
        relax_lb=1,
        relax_ub=1,
    )
    solved = placement.run(pyomo_solver="appsi_highs", max_timelimit=15, auto_relax=True)
    generate_chains(
        chain_table_type="EC",
        node_id_begin=1,
        node_id_end=num_nodes,
        num_disks_per_node=num_disks_per_node,
        num_targets_per_disk=num_targets_per_disk,
        target_id_prefix=1,
        chain_id_prefix=9,
        incidence_matrix=placement.get_incidence_matrix(solved),
    )

View File

@@ -0,0 +1,51 @@
-- ClickHouse schema for 3FS monitoring data.
-- NOTE(review): the identifier `3fs` starts with a digit and is unquoted;
-- confirm the target ClickHouse version accepts it without backquotes.
CREATE DATABASE IF NOT EXISTS 3fs;
-- Counter-style metrics: one Int64 sample per (metric, host, tag, ...) row.
-- Partitioned by day, ordered for metric/host/pod lookups, retained 1 month.
CREATE TABLE IF NOT EXISTS 3fs.counters (
`TIMESTAMP` DateTime CODEC(DoubleDelta),
`metricName` LowCardinality(String) CODEC(ZSTD(1)),
`host` LowCardinality(String) CODEC(ZSTD(1)),
`tag` LowCardinality(String) CODEC(ZSTD(1)),
`val` Int64 CODEC(ZSTD(1)),
`mount_name` LowCardinality(String) CODEC(ZSTD(1)),
`instance` String CODEC(ZSTD(1)),
`io` LowCardinality(String) CODEC(ZSTD(1)),
`uid` LowCardinality(String) CODEC(ZSTD(1)),
`pod` String CODEC(ZSTD(1)),
`thread` LowCardinality(String) CODEC(ZSTD(1)),
`statusCode` LowCardinality(String) CODEC(ZSTD(1))
)
ENGINE = MergeTree
PRIMARY KEY (metricName, host, pod, instance, TIMESTAMP)
PARTITION BY toDate(TIMESTAMP)
ORDER BY (metricName, host, pod, instance, TIMESTAMP)
-- Drop rows one month after their TIMESTAMP.
TTL TIMESTAMP + toIntervalMonth(1)
SETTINGS index_granularity = 8192;
-- Distribution-style metrics: pre-aggregated latency/size statistics
-- (count/mean/min/max and p50/p90/p95/p99) per metric sample window.
-- Same partitioning, ordering, and 1-month retention as 3fs.counters.
CREATE TABLE IF NOT EXISTS 3fs.distributions (
`TIMESTAMP` DateTime CODEC(DoubleDelta),
`metricName` LowCardinality(String) CODEC(ZSTD(1)),
`host` LowCardinality(String) CODEC(ZSTD(1)),
`tag` LowCardinality(String) CODEC(ZSTD(1)),
`count` Float64 CODEC(ZSTD(1)),
`mean` Float64 CODEC(ZSTD(1)),
`min` Float64 CODEC(ZSTD(1)),
`max` Float64 CODEC(ZSTD(1)),
`p50` Float64 CODEC(ZSTD(1)),
`p90` Float64 CODEC(ZSTD(1)),
`p95` Float64 CODEC(ZSTD(1)),
`p99` Float64 CODEC(ZSTD(1)),
`mount_name` LowCardinality(String) CODEC(ZSTD(1)),
`instance` String CODEC(ZSTD(1)),
`io` LowCardinality(String) CODEC(ZSTD(1)),
`uid` LowCardinality(String) CODEC(ZSTD(1)),
`method` LowCardinality(String) CODEC(ZSTD(1)),
`pod` String CODEC(ZSTD(1)),
`thread` LowCardinality(String) CODEC(ZSTD(1)),
`statusCode` LowCardinality(String) CODEC(ZSTD(1))
)
ENGINE = MergeTree
PRIMARY KEY (metricName, host, pod, instance, TIMESTAMP)
PARTITION BY toDate(TIMESTAMP)
ORDER BY (metricName, host, pod, instance, TIMESTAMP)
TTL TIMESTAMP + toIntervalMonth(1)
SETTINGS index_granularity = 8192;

View File

@@ -0,0 +1,12 @@
[Unit]
Description=fuse_main Server
# Only start once the network is up (the launcher config points at remote services).
Requires=network-online.target
After=network-online.target
[Service]
# FUSE daemon serves many concurrent file handles; raise the fd limit.
LimitNOFILE=1000000
ExecStart=/opt/3fs/bin/hf3fs_fuse_main --launcher_cfg /opt/3fs/etc/hf3fs_fuse_main_launcher.toml
Type=simple
# NOTE(review): no Restart= directive — the daemon will not restart on crash;
# confirm this is intentional.
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,12 @@
[Unit]
Description=meta_main Server
# Only start once the network is up (metadata service talks to mgmtd/FoundationDB).
Requires=network-online.target
After=network-online.target
[Service]
# Metadata service handles many concurrent connections; raise the fd limit.
LimitNOFILE=1000000
ExecStart=/opt/3fs/bin/meta_main --launcher_cfg /opt/3fs/etc/meta_main_launcher.toml --app-cfg /opt/3fs/etc/meta_main_app.toml
Type=simple
# NOTE(review): no Restart= directive — the service will not restart on crash;
# confirm this is intentional.
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,12 @@
[Unit]
Description=mgmtd_main Server
# Only start once the network is up (cluster manager serves heartbeats over the network).
Requires=network-online.target
After=network-online.target
[Service]
# Cluster manager maintains connections to all services; raise the fd limit.
LimitNOFILE=1000000
ExecStart=/opt/3fs/bin/mgmtd_main --launcher_cfg /opt/3fs/etc/mgmtd_main_launcher.toml --app-cfg /opt/3fs/etc/mgmtd_main_app.toml
Type=simple
# NOTE(review): no Restart= directive — the service will not restart on crash;
# confirm this is intentional.
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,11 @@
[Unit]
Description=monitor_collector_main Server
# Only start once the network is up (collector writes metrics to ClickHouse).
Requires=network-online.target
After=network-online.target
[Service]
ExecStart=/opt/3fs/bin/monitor_collector_main --cfg /opt/3fs/etc/monitor_collector_main.toml
Type=simple
# NOTE(review): no Restart= and no raised LimitNOFILE, unlike the other 3FS
# units — confirm the defaults are sufficient for the collector.
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,14 @@
[Unit]
Description=storage_main Server
# Only start once the network is up (storage service registers with mgmtd).
Requires=network-online.target
After=network-online.target
[Service]
# Storage service keeps many chunk files and connections open; raise the fd limit.
LimitNOFILE=1000000
# Unlimited locked memory — presumably for RDMA memory registration; confirm.
LimitMEMLOCK=infinity
# Allow a graceful shutdown window (flush/handover) before SIGKILL.
TimeoutStopSec=5m
ExecStart=/opt/3fs/bin/storage_main --launcher_cfg /opt/3fs/etc/storage_main_launcher.toml --app-cfg /opt/3fs/etc/storage_main_app.toml
Type=simple
# NOTE(review): no Restart= directive — the service will not restart on crash;
# confirm this is intentional.
[Install]
WantedBy=multi-user.target

6
docs/README.md Normal file
View File

@@ -0,0 +1,6 @@
# Documentation
* [Design Notes](design_notes.md)
* [Setup Guide](../deploy/README.md)
* [USRBIO API Reference](../src/lib/api/UsrbIo.md)
* [P Specifications](../specs/README.md)

290
docs/design_notes.md Normal file
View File

@@ -0,0 +1,290 @@
# Design Notes
## Design and implementation
The 3FS system has four components: cluster manager, metadata service, storage service and client. All components are connected in a RDMA network (InfiniBand or RoCE).
Metadata and storage services send heartbeats to cluster manager. Cluster manager handles membership changes and distributes cluster configuration to other services and clients. Multiple cluster managers are deployed and one of them is elected as the primary. Another manager is promoted as primary when the primary fails. Cluster configuration is typically stored in a reliable distributed coordination service, such as ZooKeeper or etcd. In our production environment, we use the same key-value store as file metadata to reduce dependencies.
File metadata operations (e.g. open or create files/directories) are sent to metadata services, which implement the file system semantics. Metadata services are stateless, since file metadata are stored in a transactional key-value store (e.g. FoundationDB). Clients can connect to any metadata service.
Each storage service manages a few local SSDs and provides a chunk store interface. The storage service implements Chain Replication with Apportioned Queries (CRAQ) to ensure strong consistency. CRAQs write-all-read-any approach helps to unleash the throughput of SSDs and RDMA network. A 3FS file is split into equally sized chunks, which are replicated over multiple SSDs.
Two clients are developed for applications: FUSE client and native client. Most applications use FUSE client, which has a low adoption barrier. Performance-critical applications are integrated with the native client.
## File system interfaces
Object store is becoming a popular option for data analytics and machine learning. However, file system semantics and a unified namespace where files are organized in directories provide greater flexibility for applications.
- *Atomic directory manipulation* An object store can approximate hierarchical directory structures by using slashes (/) in object keys. However, it doesnt natively support operations like atomically moving files/directories, or recursively deleting entire directories. Actually a common pattern in our internal applications involves creating a temporary directory, writing files to it, and then moving the directory to its final location. When handling a large number of small files, the recursive delete for directories is crucial. Without it, applications have to traverse each directory and remove files one by one.
- *Symbolic and hard links* Our applications utilize symbolic and hard links to create lightweight snapshots of dynamically updated datasets, where new data is appended as individual files.
- *Familiar interface* The file interface is well known and used everywhere. There is no need to learn a new storage API. Many datasets are stored as CSV/Parquet files. Adapting file-based data loaders to use the 3FS FUSE client or native client is straightforward.
### Limitations of FUSE
FUSE (Filesystem in Userspace) simplifies file system client development by redirecting I/O operations to user-space processes through the FUSE kernel module. It creates the illusion that applications are accessing the remote file system as if it were a local file system. However, it has performance limitations:
- *Memory copy overhead* The user-space file system daemon cannot access application memory. Data transfer between kernel and user spaces consumes memory bandwidth and increases end-to-end latency.
- *Primitive multi-threading support* When an application initiates I/O requests, FUSE places these requests into a multi-threaded shared queue, protected by a spin lock. The user-space file system daemon then retrieves and processes requests from this queue. Due to lock contention, FUSEs I/O processing capability fails to scale with the number of threads. Our benchmark results indicate that FUSE only handles approximately 400K 4KiB reads per second. Further increasing concurrency does not improve performance as lock contention intensifies. `perf` profiling reveals that the kernel-space spin lock consumes a significant amount of CPU time.
Most applications, e.g. data analytics, perform large block writes on 3FS or they can buffer data in memory and flush it to 3FS when write buffer is full. However, FUSE on Linux 5.x does not support concurrent writes to the same file[^1]. Applications overcome this limitation by writing to multiple files concurrently, maximizing the total throughput.
Read operations exhibit more complex patterns. Some training jobs require random access to dataset samples, with read sizes varying from a few kilobytes to several megabytes per sample. And samples are typically not 4K-aligned in files. Data loaders are specifically designed to fetch batches of samples. But they perform poorly when handling small random reads on FUSE-mounted 3FS. The bandwidth of SSDs and the RDMA network is not fully utilized.
### Asynchronous zero-copy API
Implementing the file system client as a VFS kernel module avoids performance issues mentioned above. But kernel module development is significantly more challenging than user-space system programming. Bugs are difficult to diagnose and can lead to catastrophic failures in production environments. For example, machines may crash and leave no log message for debugging. When upgrading a kernel module, all processes using the file system must be stopped cleanly; otherwise, a machine restart is required.
For these reasons, we have chosen to implement a native client within the FUSE daemon. This client offers an interface that supports asynchronous zero-copy I/O operations. File meta operations are still handled by FUSE daemon (e.g. open/close/stat files). Applications call `open()` to obtain a file descriptor (fd) and register it via native API. They can then perform I/O operations on the file with native client. This approach ensures consistency in metadata operations with the POSIX API, making it easier to migrate existing code.
The asynchronous, zero-copy API is inspired by Linux `io_uring`. Below are the key data structures in the API:
- *Iov* A large memory region for zero-copy read/write operations, shared between the user process and the native client. InfiniBand memory registration is managed by the client. In native API, all read data will be read into Iov, and all write data should be written to Iov before calling the API.
- *Ior* A small shared ring buffer for communication between user process and native client. The usage of Ior is similar to Linux `io_uring`, where the user process enqueues read/write requests, and the native client dequeues these requests for completion. The requests are executed in batches, with their sizes controlled by the `io_depth` parameter. Multiple batches are processed in parallel, whether from different rings or the same ring. However, multiple rings are still recommended for multi-threaded applications, as sharing a ring requires synchronization, which can impact performance.
Within the native client, multiple threads are spawned to fetch I/O requests from the Iors. These requests are batched and dispatched to storage services, reducing RPC overhead caused by small read requests.
## File metadata store
### Location of file chunks
3FS divides file data into equally sized chunks and stripes them across multiple replication chains (replication chains and chain tables are defined in Section [Data placement](#data-placement)). Users can specify the chain table, chunk size, and stripe size for files on a per-directory basis. Each chunk is independently stored on multiple storage services, with its chunk ID generated by concatenating the file's inode id and chunk index.
When creating a new file, the metadata service employs a round-robin strategy to select consecutive replication chains from the designated chain table, based on the stripe size. Next, a random seed is generated to shuffle the selected chains. This allocation strategy ensures balanced data distribution across chains and SSDs.
When an application opens a file, the client contacts the meta service to obtain the files data layout information. Then the client can independently compute chunk IDs and chains for data operations, minimizing the involvement of the meta service in the critical path.
### File metadata on transactional key-value store
3FS uses FoundationDB as its distributed storage system for metadata. FoundationDB provides a key-value store interface and supports transactions with Serializable Snapshot Isolation (SSI). 3FS stores all metadata as key-value pairs in FoundationDB. Meta services follow a stateless architecture, greatly enhancing maintainability by allowing administrators to seamlessly upgrade or restart services without disruption. When clients experience request failures or timeouts, they can automatically fail over to other available services.
The file system metadata primarily consists of two core structures: inodes and directory entries. Inodes store attribute information for files, directories, and symbolic links, each identified by a globally unique 64-bit identifier that increments monotonically. Inode keys are constructed by concatenating the "INOD" prefix with the inode id, which is encoded in little-endian byte order to spread inodes over multiple FoundationDB nodes. The inode values vary by its type:
- All inode types contain basic attributes: ownership, permissions, access/modification/change times.
- Additional attributes for file inodes: file length, chunk size, selected range in chain table, shuffle seed.
- Additional attributes for directory inodes: the parent directorys inode id, default layout configurations for subdirectories/files (chain table, chunk size, stripe size). The parents inode id is required to detect loops when moving directories. When moving `dir_a/dir_b` to `dir_c/`, we need to ensure that `dir_c` is not a descendant of `dir_b`, which can be achieved by checking all ancestors of `dir_c` upward.
- Additional attributes for symbolic link inodes: target path string.
Directory entry keys are composed of a "DENT" prefix, the parent inode ID, and the entry name. Directory entry values store the target inode id and inode type. All entries within a directory naturally form a contiguous key range, allowing efficient directory listing via range queries.
The meta operations leverages FoundationDBs transactions:
- Read-only transactions used for metadata queries: fstat, lookup, listdir etc.
- Read-write transactions used for metadata updates: create, link, unlink, rename etc.
For write transactions, FoundationDB tracks the read/write key sets to form conflict detection sets. When concurrent transaction conflicts are detected, the meta service automatically retries the transaction. This design enables multiple meta services to process requests in parallel while maintaining file system metadata consistency.
### Dynamic file attributes
On most local file systems, deleting an opened file is deferred until all associated file descriptors are closed. Consequently, it is necessary to track all file descriptors of the file. Training jobs open a large number of files during startup. Storing all file descriptors would impose heavy load on meta service and FoundationDB. Since training jobs do not depend on this feature, 3FS does not track file descriptors opened in read-only mode.
3FS maintains a file session for each file descriptor (fd) opened in write mode since deleting write opened files may lead to unreclaimable garbage chunks from concurrent writes. When a file with active write sessions is deleted, meta service delays the deletion until all its fds are closed. To prevent lingering sessions from offline clients, the 3FS meta service periodically checks client liveness and cleans up sessions of offline clients.
The file length is stored in the inode. For files being actively updated, the length stored in inode may diverge from the actual length. Clients periodically (5 seconds by default) report to meta service maximum write position of each file opened in write mode. If this position exceeds the length in inode and there is no concurrent truncate operation, this position is adopted as the new file length.
Due to the possibility of concurrent writes from multiple clients, the method described above ensures only eventual consistency for file lengths. When processing close/fsync operations, the meta service obtains the precise file length by querying the ID and length of the last chunk from the storage service. Since file data is striped across multiple chains, this operation incurs non-negligible overhead.
Concurrent updates to the same files length by multiple meta services may cause transaction conflicts and lead to repeated file length computation. To mitigate this, meta service distributes file length update tasks across multiple meta services using inode IDs and the rendezvous hash algorithm.
Our production environments use a large stripe size: 200. For small files, the number of chains containing file chunks is well below this number. The number of potentially used chains is stored in file inode and used as a hint when updating the length. It starts with an initial value of 16 and is doubled each time additional file chunks are written to more chains. This allows us to avoid querying all 200 chains when updating lengths of small files. This optimization can also be extended to the deletion of small files.
## Chunk storage system
The design goal of chunk storage system is to achieve the highest bandwidth possible even when there are storage medium failures. The read/write throughput of 3FS should scale linearly with the number of SSDs and bisection network bandwidth between clients and storage services. Applications access storage services in a locality-oblivious manner.
### Data placement
Each file chunk is replicated over a chain of storage targets using chain replication with apportioned queries (CRAQ). In CRAQ write requests are sent to the head target and propagated along a chain. Read requests can be sent to any of the storage target. Usually the read traffic is evenly distributed among all targets in a chain for better load balance. Multiple storage targets are created on each SSD and the targets join different chains.
Suppose there are 6 nodes: A, B, C, D, E, F. Each node has 1 SSD. Create 5 storage targets on each SSD: 1, 2, ... 5. Then there are 30 targets in total: A1, A2, A3, ..., F5. If each chunk has 3 replicas, a chain table is constructed as follows.
| Chain | Version | Target 1 (head) | Target 2 | Target 3 (tail) |
| :---: | :-----: | :-------------: | :------: | :-------------: |
| 1 | 1 | `A1` | `B1` | `C1` |
| 2 | 1 | `D1` | `E1` | `F1` |
| 3 | 1 | `A2` | `B2` | `C2` |
| 4 | 1 | `D2` | `E2` | `F2` |
| 5 | 1 | `A3` | `B3` | `C3` |
| 6 | 1 | `D3` | `E3` | `F3` |
| 7 | 1 | `A4` | `B4` | `C4` |
| 8 | 1 | `D4` | `E4` | `F4` |
| 9 | 1 | `A5` | `B5` | `C5` |
| 10 | 1 | `D5` | `E5` | `F5` |
Each chain has a version number. The version number is incremented if the chain is changed (e.g. a storage target is offline). Only the primary cluster manager makes changes to chain tables.
A few chain tables can be constructed to support different data placement requirements. For example, two chain tables can be created, one for batch/offline jobs and another for online services. The two tables consist of storage targets on mutually exclusive nodes and SSDs.
Logically, the state of each chain changes independently. Each chain can be included in multiple chain tables. The concept of chain table is created to let metadata service pick a table for each file and stripe file chunks across chains in the table.
### Balanced traffic during recovery
Suppose read traffic is evenly distributed among all storage targets in the above chain table. When A fails its read requests would be redirected to B and C. Under heavy load the read bandwidth of B, C is immediately saturated and B, C become the bottleneck of the entire system. Replacing a failed SSD and syncing data to the new SSD can take several hours. The read throughput is impaired during this period.
To reduce the performance impact, we can have more SSDs share the redirected traffic. In the following chain table, A is paired with every other SSDs. When A fails, each of the other SSDs receives 1/5 of As read traffic.
| Chain | Version | Target 1 (head) | Target 2 | Target 3 (tail) |
| :---: | :-----: | :-------------: | :------: | :-------------: |
| 1 | 1 | `B1` | `E1` | `F1` |
| 2 | 1 | `A1` | `B2` | `D1` |
| 3 | 1 | `A2` | `D2` | `F2` |
| 4 | 1 | `C1` | `D3` | `E2` |
| 5 | 1 | `A3` | `C2` | `F3` |
| 6 | 1 | `A4` | `B3` | `E3` |
| 7 | 1 | `B4` | `C3` | `F4` |
| 8 | 1 | `B5` | `C4` | `E4` |
| 9 | 1 | `A5` | `C5` | `D4` |
| 10 | 1 | `D5` | `E5` | `F5` |
To achieve maximum read throughput during recovery, the load balance problem can be formulated as a balanced incomplete block design. The optimal solution is obtained by using integer programming solver.
### Data replication
CRAQ is a write-all-read-any replication protocol optimized for read-heavy workloads. Utilizing read bandwidth of all replicas is critical to achieve highest read throughput in an all-flash storage system.
When a write request is received by a storage service, it goes through the following steps:
1. The service checks if the chain version in write request matches with the latest known version; reject the request if its not. The write request could be sent by a client or a predecessor in the chain.
2. The service issues RDMA Read operations to pull write data. If the client/predecessor fails, the RDMA Read operations may time out and the write is aborted.
3. Once the write data is fetched into local memory buffer, a lock for the chunk to be updated is acquired from a lock manager. Concurrent writes to the same chunk are blocked. All writes are serialized at the head target.
4. The service reads the committed version of the chunk into memory, applies the update, and stores the updated chunk as a pending version. A storage target may store two versions of a chunk: a committed version and a pending version. Each version has a monotonically-increasing version number. The version numbers of committed version and pending versions are `v` and `u` respectively, and satisfy `u = v + 1`.
5. If the service is the tail, the committed version is atomically replaced by the pending version and an acknowledgment message is sent to the predecessor. Otherwise, the write request is forwarded to the successor. When the committed version is updated, the current chain version is stored as a field in the chunk metadata.
6. When an acknowledgment message arrives at a storage service, the service replaces the committed version with the pending version and continues to propagate the message to its predecessor. The local chunk lock is then released.
Suppose there are 3 targets in the chain: `A, B, C`. A write request has just entered step 5 at `A`. `A` forwards the request to successor `B`. Then `B` instantly fails and the forwarded write request is lost. When cluster manager detects `B`s failure, it marks `B` as offline and moves it to the end of chain and broadcasts the updated chain table. Once `A` receives the latest chain table, it forwards the write request to the new successor `C`. `C` may not receive the latest chain table yet and rejects the request. But `A` can keep forwarding the request to `C`. Eventually `C` gets the latest chain table and accepts the request.
When a read request arrives at a storage service:
1. When the service only has a committed version of the chunk, this version is returned to the client.
2. Unlike CRAQ, our implementation does not issue version query to the tail target. When there are both committed and pending versions, the service replies a special status code to notify the client. The client may wait for a short interval and retry. Or the client can issue a relaxed read request to get the pending version.
### Failure detection
The cluster manager relies on heartbeats to detect fail-stop failures. Cluster manager declares a service failed if it does not receive heartbeats from it for a configurable interval (e.g. T seconds). A service stops processing requests and exits if it cannot communicate with cluster manager for T/2 seconds. The heartbeat can be seen as a request to \*renew a lease\* granted by the manager.
The metadata services are stateless. The list of online meta services provided by cluster manager is a simple service discovery mechanism that helps clients create connections to metadata services. If one meta service is down, the clients may switch to any other metadata service.
Cluster manager plays a more critical role in membership changes of storage services. It maintains a global view of chain tables and storage targets states. Each storage target has a public state and a local state.
Public state indicates if its ready to serve read requests and if write requests would be propagated to it. Public states are stored with chain tables and distributed to services and clients.
| Public State | Read | Write | Notes |
| :----------- | :--: | :---: | :---------------------------------------------- |
| serving | Y | Y | service alive and serving client requests |
| syncing | N | Y | service alive and data recovery is in progress |
| waiting | N | N | service alive and data recovery not started yet |
| lastsrv | N | N | service down and it was the last serving target |
| offline | N | N | service down or storage medium failure |
Local state is only known by storage services and cluster manager, and its stored in the memory of cluster manager. If a storage target has medium failure, the related service sets the targets local state to offline in heartbeat. If a storage service is down, storage targets managed by the service are marked offline.
| Local State | Notes |
| :---------- | :--------------------------------------------------- |
| up-to-date | service alive and serving client requests |
| online | service alive and target in syncing or waiting state |
| offline | service down or storage medium failure |
A storage target can change from one public state to another in response to the latest local state. The local state plays the role of a triggering event. The cluster manager periodically scans every chain and updates the public states of targets on the chain according to a state-transition table.
- The chain version is incremented if the chain is updated.
- If a storage target is marked offline, its moved to the end of chain.
- If a storage service finds public state of any local storage target is lastsrv or offline, it exits immediately. The service may be isolated from the cluster manager by network partition error.
- Once the data recovery of a storage target in syncing state is completed, the storage service sets the target's local state to up-to-date in subsequent heartbeat messages sent to cluster manager.
| Local State | Current Public State | Predecessors Public State | Next Public State |
| :---------- | :------------------- | :------------------------- | :---------------- |
| up-to-date | serving | (any) | serving |
| | syncing | (any) | serving |
| | waiting | (any) | waiting |
| | lastsrv | (any) | serving |
| | offline | (any) | waiting |
| online | serving | (any) | serving |
| | syncing | serving | syncing |
| | | not serving | waiting |
| | waiting | serving | syncing |
| | | not serving | waiting |
| | lastsrv | (any) | serving |
| | offline | (any) | waiting |
| offline | serving | has no predecessor | lastsrv |
| | | has predecessor | offline |
| | syncing | (any) | offline |
| | waiting | (any) | offline |
| | lastsrv | (any) | lastsrv |
| | offline | (any) | offline |
### Data recovery
When a storage service exits (e.g. process crashes or restarts during upgrade), or a storage medium failure occurs, all related storage targets will be marked as offline and moved to the end of chains by cluster manager. Once the service restarts, each target on the service enters into the recovery process independently. The entire recovery process overlaps with normal activity and minimizes any interruption.
When a previously offline storage service starts:
1. The service periodically pulls latest chain tables from cluster manager. But it does not send heartbeats until all its storage targets have been marked offline in the latest chain tables. This ensures all its targets would go through the data recovery process.
2. When a write request arrives during recovery, the request is always a full-chunk-replace write. The local committed version is updated and any existing pending version is abandoned. Since current service is the tail, an acknowledgment message is sent to the predecessor. The full state of the predecessor is copied to the returning service through a continuous stream of full-chunk-replace writes.
3. Before the data recovery of a storage target starts, the predecessor sends a dump-chunkmeta request to the returning service. Then the service iterates the local chunk metadata store to collect the ids, chain versions and committed/pending version numbers of all chunks on the target, and replies the collected metadata to the predecessor.
4. When a sync-done message arrives, the service knows that the storage target is up-to-date. It sets local state of the target to up-to-date in heartbeat messages sent to cluster manager.
When a storage service finds a previously offline successor is online:
1. The service starts to forward normal write requests to the successor. Clients may only update a portion of the chunk, but the forwarded write requests should contain the whole chunk, i.e. a full-chunk-replace write.
2. The service sends a dump-chunkmeta request to the successor. Once the metadata of all chunks on the successor target are received, it collects the chunk metadata on its local target. Then it compares the two copies of chunk metadata to decide which chunks should be transferred.
3. The selected chunks are transferred to the successor by issuing full-chunk-replace write requests.
- The chunk lock is first acquired for each chunk.
- The chain version, committed version number and chunk content are read and transferred to successor by sending a full-chunk-replace request.
- The chunk lock is released.
4\. When all required chunks have been transferred, a sync-done message is sent to the successor.
The rules used to decide which chunks should be transferred are:
- If a chunk only exists on the local target, it should be transferred.
- If a chunk only exists on the remote target, it should be removed.
- If the chain version of local chunk replica is greater than that of the remote chunk replica, it should be transferred.
- If the chain versions of local/remote chunk replicas are the same but local committed version number does not equal to the remote pending version number, it should be transferred.
- Otherwise, two chunk replicas are either the same or being updated by in-progress write requests.
### Chunks and the metadata
File chunks are stored in the chunk engine. On each SSD, the persistent storage of the chunk engine consists of a fixed number of data files for storing chunk data, and a RocksDB instance for maintaining chunk metadata and other system information. Additionally, the chunk engine maintains an in-memory cache of chunk metadata to enhance query performance. A chunk allocator is implemented for fast allocation of new chunks. The chunk engine interface provides thread-safe access through the following operations:
1. *open/close* Initializes the engine by loading metadata from RocksDB and reconstructing chunk allocator states.
2. *get* Retrieves chunk metadata and reference-counted handle through a hashmap cache, enabling concurrent access with O(1) average complexity.
3. *update* Implements copy-on-write (COW) semantics by allocating new chunks before modifying data. Old chunks remain readable until all handles are released.
4. *commit* Commit the updated chunk metadata to RocksDB via write batches to ensure atomic updates; synchronously refresh the chunk metadata cache.
The chunk data will ultimately be stored on physical blocks. Physical block sizes range from 64KiB to 64MiB in increments of powers of two, totaling 11 distinct sizes. The allocator will assign physical blocks whose sizes most closely match the actual chunk size. A resource pool is constructed for each physical block size, with each pool containing 256 physical files. The usage status of physical blocks is maintained in memory using bitmaps. When a physical block is reclaimed, its bitmap flag is set to 0. The actual storage space of the block remains preserved and will be prioritized for subsequent allocations. When no available physical blocks remain, `fallocate()` will be used to allocate a contiguous large space in physical files, creating 256 new physical blocks - this approach helps reduce disk fragmentation.
When performing write operations on a chunk, the allocator first assigns a new physical block. The system then reads existing chunk data into a buffer, applies the update, and writes the updated buffer to the newly allocated block. An optimized process is implemented for appends, where data is directly added in-place at the end of the existing block. A new copy of metadata is constructed from the new block's location and existing chunk metadata. Subsequently, both the new chunk metadata and statuses of new and old physical blocks are atomically updated in RocksDB.
[^1]: https://elixir.bootlin.com/linux/v5.4.284/source/fs/fuse/file.c#L1573

Binary file not shown.

After

Width:  |  Height:  |  Size: 523 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 658 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 341 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.6 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 257 KiB

310
hf3fs/__init__.py Normal file
View File

@@ -0,0 +1,310 @@
from hf3fs_py_usrbio import Client, iovec
#import attrs
from contextlib import contextmanager, AbstractContextManager
from dataclasses import dataclass
import functools
import os
from pathlib import PurePosixPath
import threading as th
from pkg_resources import get_distribution
try:
__version__ = get_distribution('hf3fs').version
except:
__version__ = "debug"
DEFAULT_CLIENT = th.local()
DEFAULT_CLIENT.client = None
DEFAULT_CLIENT.clients = {}
#@attrs.define
@dataclass
class MountInfo:
    # Per-mount connection info registered via setMountInfo().
    token: str      # authentication token passed to Client()
    as_super: bool  # whether to connect with super-user privileges

# Registry of mount name -> MountInfo, populated by setMountInfo().
MOUNT_INFO = {}
def _getDefaultClient(kwargs):
if 'client' in kwargs and kwargs['client'] is not None:
client = kwargs['client']
del kwargs['client']
elif 'mount_name' in kwargs and kwargs['mount_name'] is not None:
mount_name = kwargs['mount_name']
if mount_name not in DEFAULT_CLIENT.clients:
setupDefaultClient(mount_name)
client = DEFAULT_CLIENT.clients[mount_name]
del kwargs['mount_name']
elif DEFAULT_CLIENT.client is None:
raise RuntimeError("default client not setup")
else:
client = DEFAULT_CLIENT.client
return client, kwargs
def _setupH3Method(name):
    # Create a module-level wrapper for Client.<name> that resolves the target
    # client from kwargs (or the thread-local default) at call time, then
    # install it under the same name in this module's globals.
    @functools.wraps(getattr(Client, name))
    def wrapper(*args, **kwargs):
        nonlocal name
        client, kwargs = _getDefaultClient(kwargs)
        return getattr(client, name)(*args, **kwargs)
    globals()[name] = wrapper

# Export one wrapper per supported Client method so callers can write e.g.
# hf3fs.stat(...) without holding a client object themselves.
for _name in ['stat', 'fstat', 'mkdir', 'rmdir', 'unlink', 'remove', 'realpath', 'readlink', 'opendir', 'readdir',
              'creat', 'symlink', 'link', 'open', 'close', 'chmod', 'chown', 'chdir', 'ftruncate',
              'iovalloc', 'iovfree', 'preadv', 'pwritev']:
    _setupH3Method(_name)
def setMountInfo(mount_name, token, as_super=False):
    """Register credentials for *mount_name* so setupDefaultClient() can use them."""
    # Plain mutation of the module-level dict; no `global` statement needed.
    MOUNT_INFO[mount_name] = MountInfo(token, as_super)
def setupDefaultClient(mount_name):
    """Create a client for a mount registered via setMountInfo() and cache it.

    Returns the new client; raises ValueError for an unregistered mount name.
    """
    if mount_name not in MOUNT_INFO:
        raise ValueError(f"unknown mount name '{mount_name}'")
    mount = MOUNT_INFO[mount_name]
    client = Client(mount_name, mount.token, as_super=mount.as_super)
    DEFAULT_CLIENT.clients[mount_name] = client
    return client
@contextmanager
def defaultClient(mount_name, token, as_super=False):
    """Context manager installing a temporary thread-local default client.

    Yields the new client and restores the previously active default client
    on exit, even when the body raises.
    """
    previous = DEFAULT_CLIENT.client
    DEFAULT_CLIENT.client = Client(mount_name, token, as_super=as_super)
    try:
        yield DEFAULT_CLIENT.client
    finally:
        DEFAULT_CLIENT.client = previous
def withClient(f):
    """Decorator: resolve the effective client from kwargs and pass it as client=."""
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        client, cleaned = _getDefaultClient(kwargs)
        return f(client=client, *args, **cleaned)
    return wrapper
@withClient
def listdir(path='.', client=None):
    # os.listdir() analogue: return the names of all entries in *path*,
    # iterating the client's directory stream until it is exhausted.
    if path is None:
        path = '.'
    dirp = client.opendir(path)
    # print(dirp, flush=True)
    fileList = []
    while True:
        dent = client.readdir(dirp)
        # print('dent', dent, flush=True)
        if dent is None:
            break
        fileList.append(dent.d_name)
    return fileList
class DirEntry(object):
    # Analogue of os.DirEntry for 3FS scandir(); stats lazily via the client.
    @withClient
    def __init__(self, parentPath, name, etype, parentFd, client=None):
        self._parentPath = PurePosixPath(parentPath)
        self._name = name
        self._etype = etype        # d_type from readdir (DT_* constant below)
        self._parentFd = parentFd  # fd of the parent directory, used as dir_fd for stat()
        self._client = client
        self._st = None            # cached stat result, filled on first stat()

    @property
    def name(self):
        return self._name

    @property
    def path(self):
        return str(self._parentPath / self._name)

    # POSIX file-type bits of st_mode (S_IF*) and readdir d_type values.
    _S_IFMT = 0o170000
    _S_IFLNK = 0o120000
    _S_IFREG = 0o100000
    _S_IFDIR = 0o040000
    _DT_DIR = 4
    _DT_REG = 8
    _DT_LNK = 10

    def _checkWFollow(self, against, follow_symlinks):
        # *against* is a (DT_* value, S_IF* mode) pair for the type under test;
        # symlinks are resolved through stat() only when follow_symlinks is set.
        if self._etype == against[0]:
            return True
        elif follow_symlinks and self.is_symlink():
            st = self.stat(True)
            return (st.st_mode & self._S_IFMT) == against[1]
        else:
            return False

    def is_dir(self, follow_symlinks=True):
        return self._checkWFollow((self._DT_DIR, self._S_IFDIR), follow_symlinks)

    def is_file(self, follow_symlinks=True):
        return self._checkWFollow((self._DT_REG, self._S_IFREG), follow_symlinks)

    def is_symlink(self):
        return self._etype == self._DT_LNK

    def stat(self, follow_symlinks=True):
        # NOTE(review): the first call's result is cached regardless of
        # follow_symlinks, so a later call with a different flag returns the
        # cached value — confirm this is intended.
        if self._st is None:
            self._st = self._client.stat(self._name, dir_fd=self._parentFd, follow_symlinks=follow_symlinks)
        return self._st
@withClient
def scandir(path='.', dir_fd=None, client=None):
    # os.scandir() analogue returning a combined context-manager/iterator of
    # DirEntry objects. It also opens an O_PATH fd on the scanned directory
    # (exposed as .dir_fd) so callers such as walk2() can stat entries
    # relative to it.
    class DirEntryIter(AbstractContextManager):
        def __init__(self, path, client, dir_fd):
            self._path = path
            self._client = client
            self._dir_fd = dir_fd  # dir_fd that *path* is resolved against
            self._fd = None        # O_PATH fd of the scanned directory
            self._dirp = None      # client directory stream

        @property
        def dir_fd(self):
            return self._fd

        def close(self):
            if self._fd is not None:
                self._client.close(self._fd)
                self._fd = None
                self._dirp = None

        def __iter__(self):
            # NOTE(review): if opendir() below raises, the fd opened here leaks
            # until close(); calling __iter__ twice also re-opens the fd — confirm.
            self._fd = self._client.open(self._path, os.O_DIRECTORY | os.O_PATH, dir_fd=self._dir_fd)
            # print('dirfd to scan', self._fd, flush=True)
            self._dirp = self._client.opendir(self._path, dir_fd=self._dir_fd)
            return self

        def __next__(self):
            dent = self._client.readdir(self._dirp)
            if dent is None:
                raise StopIteration()
            return DirEntry(self._path, dent.d_name, dent.d_type, self._fd, client=self._client)

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            self.close()
            return False  # never suppress exceptions
    if path is None:
        path = '.'
    return DirEntryIter(path, client, dir_fd)
@withClient
def walk2(top, topdown=True, onerror=None, followlinks=False, dir_fd=None, curr_dir=None, client=None):
    # os.walk() analogue that additionally yields the directory's O_PATH fd:
    # tuples of (dirpath, dirnames, filenames, dir_fd). *curr_dir* is internal
    # to the recursion: children are scanned relative to the parent's fd.
    try:
        with scandir(curr_dir if curr_dir is not None else top, dir_fd, client=client) as sd:
            dirnames = []
            filenames = []
            for dent in sd:
                name = dent.name
                if dent.is_dir(followlinks):
                    dirnames.append(name)
                    if not topdown:
                        # Bottom-up: recurse into the child before yielding this dir.
                        yield from walk2(dent.path, False, onerror, followlinks, sd.dir_fd, name, client=client)
                else:
                    filenames.append(name)
            yield (top, dirnames, filenames, sd.dir_fd)
            if topdown:
                topp = PurePosixPath(top)
                for dirname in dirnames:
                    yield from walk2(str(topp / dirname), True, onerror, followlinks, sd.dir_fd, dirname, client=client)
    except OSError as e:
        # Mirror os.walk(): report errors through onerror instead of raising.
        if onerror is not None:
            onerror(e)
@withClient
def walk(top, topdown=True, onerror=None, followlinks=False, dir_fd=None, client=None):
    """os.walk() analogue: yields (dirpath, dirnames, filenames), dropping the
    directory fd that walk2() additionally produces."""
    for entry in walk2(top, topdown, onerror, followlinks, dir_fd, None, client=client):
        yield entry[:3]
class BinaryFile(AbstractContextManager):
    """File-like wrapper over a 3FS client fd supporting read/write/seek.

    Modes: 'r' (read), 'r+' (read/write), 'r+c' (read/write, create),
    'w' (write, create+truncate), 'w+' (read/write, create+truncate).
    ignore_cache=True adds O_NONBLOCK to the open flags — presumably the 3FS
    client treats it as a cache-bypass hint; TODO confirm against the client.
    """
    @withClient
    def __init__(self, path, mode, dir_fd=None, client=None, ignore_cache=False):
        self.client = client
        if mode == 'r':
            flags = os.O_RDONLY
        elif mode == 'r+':
            flags = os.O_RDWR
        elif mode == 'r+c':
            flags = os.O_RDWR | os.O_CREAT
        elif mode == 'w':
            flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
        elif mode == 'w+':
            flags = os.O_RDWR | os.O_CREAT | os.O_TRUNC
        else:
            raise ValueError(f'invalid mode {mode}')
        if ignore_cache:
            flags |= os.O_NONBLOCK
        self._fd = None  # assigned before open() so close() is safe if open() raises
        self._off = 0    # current logical file offset
        self._fd = client.open(path, flags, 0o644, dir_fd=dir_fd)

    def __del__(self):
        # Use getattr: if __init__ raised before _fd was ever assigned (e.g.
        # an invalid mode), plain self.close() would hit AttributeError during GC.
        if getattr(self, '_fd', None) is not None:
            self.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying fd; safe to call more than once."""
        # print('to close fd', self._fd, flush=True)
        if self._fd is not None:
            self.client.close(self._fd)
            self._fd = None

    def fileno(self):
        return self._fd

    def seek(self, pos, how=os.SEEK_SET, readahead=None):
        """Move the file offset; returns the new absolute offset."""
        self._off = self.client.lseek(self._fd, pos, how, readahead=readahead)
        return self._off

    def tell(self):
        return self._off

    def _bytesLeft(self):
        # Distance from the current offset to EOF; restores the offset after probing.
        off = self._off
        flen = self.seek(0, os.SEEK_END)
        self.seek(off)
        return flen - off

    def read(self, size=None, readahead=None):
        """Read up to *size* bytes (to EOF when size is None); returns a memoryview."""
        if size is None:
            size = self._bytesLeft()
        buf = memoryview(bytearray(size))
        red = self.readinto(buf, readahead=readahead)
        return buf[:red]

    def readinto(self, buf, readahead=None):
        """Read into *buf*; returns the byte count and advances the offset."""
        red = self.client.read(self._fd, buf, readahead=readahead)
        self._off += red
        return red

    def write(self, buf, flush=False):
        """Write *buf*; returns the byte count and advances the offset."""
        writ = self.client.write(self._fd, buf, flush=flush)
        self._off += writ
        return writ

44
hf3fs/fuse.py Normal file
View File

@@ -0,0 +1,44 @@
import os
from pathlib import PosixPath
from hf3fs_py_usrbio import HF3FS_SUPER_MAGIC
# ioctl request numbers exposed by the 3FS FUSE client.
HF3FS_IOC_GET_MOUNT_NAME = 2149607424
HF3FS_IOC_GET_PATH_OFFSET = 2147772417
HF3FS_IOC_GET_MAGIC_NUM = 2147772418
HF3FS_IOC_RECURSIVE_RM = 2147772426
def serverPath(p):
    '''
    Derive the server-side path accepted by the client from a full local path.

    Resolves and normalizes *p*, drops the leading mount prefix (the first two
    components after the root, e.g. /hf3fs-cluster/<mount>), and returns the
    remainder re-anchored at '/'.

    Args:
        p: the path to resolve

    Examples:
        .. code-block:: python

            import hf3fs.fuse
            hf3fs.fuse.serverPath('/hf3fs-cluster/aaa/../cpu/abc/def')
    '''
    resolved = os.path.normpath(os.path.realpath(p))
    tail = PosixPath(resolved).parts[3:]
    return os.path.join('/', *tail)
def mountName(p):
    '''
    Derive the mount name from a full local path.

    Resolves and normalizes *p*, then returns the second path component after
    the root (e.g. 'cpu' for '/hf3fs-cluster/cpu/abc/def').

    Args:
        p: the path to resolve

    Examples:
        .. code-block:: python

            import hf3fs.fuse
            hf3fs.fuse.mountName('/hf3fs-cluster/aaa/../cpu/abc/def')
    '''
    resolved = os.path.normpath(os.path.realpath(p))
    return PosixPath(resolved).parts[2]

0
hf3fs_fuse/__init__.py Normal file
View File

7
hf3fs_fuse/fuse.py Normal file
View File

@@ -0,0 +1,7 @@
import os
from pathlib import PosixPath
def get_mount_point(p):
    """Return the mount point containing *p*: the first three components
    (root plus two levels) of the fully resolved path."""
    resolved = os.path.realpath(p)
    head = PosixPath(resolved).parts[:3]
    return os.path.join(*head)

30
hf3fs_fuse/fuse_demo.py Normal file
View File

@@ -0,0 +1,30 @@
from hf3fs_fuse.io import make_iovec, make_ioring, register_fd, deregister_fd
from multiprocessing.shared_memory import SharedMemory
import os
# Demo: end-to-end usrbio read through the 3FS FUSE mount.
# Create memory for IO
shm = SharedMemory(size=1024, create=True)
iov = make_iovec(shm, '/hf3fs-cluster', 0, -1)  # shm, mountpoint, blocksize, numa
shm.unlink()  # shm can be unlinked after make_iovec
# Create ioring for IO
ior = make_ioring('/hf3fs-cluster', 100, True, 0)  # mountpoint, num_entries, for_read, io_depth
# Open file
fd = os.open('/hf3fs-cluster/testread', os.O_RDONLY)
register_fd(fd)  # must register after open to use usrbio
# Read file: two slices of the shared buffer, read from two file offsets.
ios = [(iov[:512], fd, 512), (iov[512:], fd, 0)]  # iov, fd, offset
for io in ios:
    ior.prepare(io[0], True, io[1], io[2], userdata=io)  # iov, for_read, fd, offset, userdata
    # Only for_read == True is allowed, because ior has for_read == True
    # userdata must be a referenced python object, we reference io in the list ios, so it will not be sent to GC
resv = ior.submit().wait(min_results=2)
for res in resv:
    print(res.result)
    assert res.result == len(memoryview(res.userdata[0]))  # Check read length is correct
# Close file
deregister_fd(fd)  # must deregister before close
os.close(fd)

139
hf3fs_fuse/io.py Normal file
View File

@@ -0,0 +1,139 @@
import hf3fs_py_usrbio as h3fio
from hf3fs_py_usrbio import register_fd, deregister_fd, force_fsync, extract_mount_point, hardlink, punch_hole
import multiprocessing.shared_memory
import os
import os.path
from uuid import uuid4
class iovec:
    """Wrapper around an h3fio.iovec that owns the symlink registering the
    shared memory with 3FS; the symlink is removed when the wrapper is
    garbage-collected. Item access is delegated to the wrapped iovec.
    """
    def __init__(self, iov, link):
        self.iov = iov
        self.link = link

    def __del__(self):
        os.unlink(self.link)

    def __getitem__(self, key):
        return self.iov[key]

    def __setitem__(self, key, val):
        self.iov[key] = val
class ioring:
    """Thin wrapper over h3fio.ioring that unwraps our iovec class in prepare()."""
    def __init__(self, ior):
        self.ior = ior

    @staticmethod
    def size_for_entries(entries):
        # Shared-memory bytes needed for a ring holding this many entries.
        return h3fio.ioring.size_for_entries(entries)

    def prepare(self, iov, *args, **kwargs):
        # Accept either our iovec wrapper or a raw buffer / h3fio iovec.
        target = iov.iov if type(iov) == iovec else iov
        return self.ior.prepare(target, *args, **kwargs)

    def submit(self):
        return self.ior.submit()

    def wait(self, *args, **kwargs):
        return self.ior.wait(*args, **kwargs)
class IorPriority(object):
    # Scheduling priority values accepted by make_ioring(priority=...).
    HIGH = -1
    NORMAL = 0
    LOW = 1
def make_iovec(shm, hf3fs_mount_point, block_size=0, numa=-1):
    '''
    Create an iovec object.

    Registers the shared memory with 3FS by symlinking it under
    <mount>/3fs-virt/iovs/; the returned wrapper removes the symlink when
    garbage-collected.

    Args:
        shm: a Python multiprocessing.shared_memory.SharedMemory object
        hf3fs_mount_point: the hf3fs mount point
        block_size: defaults to 0, meaning the whole region is treated as one
            block; otherwise memory is registered in block_size chunks, which
            avoids triggering IB registration driver issues
        numa: defaults to -1, meaning no NUMA binding; set it to pin the
            memory to a specific NUMA node
    '''
    id = str(uuid4())
    target = os.path.normpath(f'/dev/shm/{shm.name}')
    # The ".b<size>" suffix encodes the block size in the registration link name.
    link = f'{hf3fs_mount_point}/3fs-virt/iovs/{id}{f".b{block_size}" if block_size > 0 else ""}'
    os.symlink(target, link)
    return iovec(h3fio.iovec(shm.buf, id, hf3fs_mount_point, block_size, numa), link)
def make_ioring(hf3fs_mount_point, entries, for_read=True, io_depth=0, priority=None, timeout=None, numa=-1, flags=0):
    '''
    Create an ioring object.

    io_depth controls the submission strategy, with three cases:

    io_depth = 0: whenever the ioring's background scan runs (or it is
    notified of pending work), it submits all queued IOs.
    io_depth > 0: the ioring submits exactly io_depth IOs at a time; the user
    must ensure enough IOs are eventually queued, otherwise wait() will block.
    io_depth < 0: each background scan/notification submits at most -io_depth IOs.

    Args:
        hf3fs_mount_point: the hf3fs mount point
        entries: maximum number of IO operations the ioring can hold
        for_read: the operation kind this ioring performs — True for reads, False for writes
        io_depth: submission strategy, see above
        numa: defaults to -1, meaning no NUMA binding; set it to pin the
            ioring's communication memory to a specific NUMA node
        flags: extra IO options; the main useful value today is 2, which
            reports an error when reading a hole instead of filling zeros
    '''
    return ioring(h3fio.ioring(hf3fs_mount_point, entries, for_read, io_depth, priority, timeout, numa, flags))
def read_file(fn, hf3fs_mount_point=None, block_size=1 << 30, off=0, priority=None, cb=None):
    """Read a file through usrbio in block_size chunks, starting at *off*.

    Args:
        fn: path of the file to read
        hf3fs_mount_point: mount point; derived from *fn* when None
        block_size: chunk size (also the shared-memory buffer size)
        off: starting offset
        priority: forwarded to make_ioring()
        cb: optional callback invoked as cb(chunk, offset) for each chunk.
            It may return an int to seek to that offset, a truthy value to
            stop, or None to continue. When cb is given this function
            returns None; otherwise it returns the file contents as bytes.
            Using cb is the suggested mode.
    """
    if hf3fs_mount_point is None:
        hf3fs_mount_point = extract_mount_point(fn)
    bufs = []
    # Track what was actually created so the finally block only cleans those
    # up; previously a failure in os.open() raised NameError here and masked
    # the original exception.
    fd = None
    ior = None
    iov = None
    shm = None
    registered = False
    try:
        fd = os.open(fn, os.O_RDONLY)
        register_fd(fd)
        registered = True
        shm = multiprocessing.shared_memory.SharedMemory(size=block_size, create=True)
        iov = make_iovec(shm, hf3fs_mount_point)
        ior = make_ioring(hf3fs_mount_point, 1, priority=priority)
        roff = off
        while True:
            ior.prepare(iov[:], True, fd, roff)
            done = ior.submit().wait(min_results=1)[0]
            if done.result < 0:
                # Negative results carry the errno.
                raise OSError(-done.result)
            if done.result == 0:
                break  # EOF
            if cb is None:
                bufs.append(bytes(shm.buf[:done.result]))
            else:
                res = cb(shm.buf[:done.result], roff)
                if type(res) == int:
                    roff = res  # callback requested a seek
                    continue
                elif res:
                    return  # callback requested a stop
            if done.result < block_size:
                break  # short read: end of file reached
            roff += block_size
        if cb is not None:
            return
        if len(bufs) == 1:
            return bufs[0]
        else:
            return b''.join(bufs)
    finally:
        if registered:
            deregister_fd(fd)
        if fd is not None:
            os.close(fd)
        # Drop the iovec before unlinking the shm so its registration symlink
        # is removed first.
        del ior
        del iov
        if shm is not None:
            shm.close()
            shm.unlink()

37
hf3fs_utils/README.md Normal file
View File

@@ -0,0 +1,37 @@
# hf3fs_cli
build:
```bash
python3 setup_hf3fs_utils.py bdist_wheel
```
usage:
```.bash
$ hf3fs_cli rmtree --help
Usage: hf3fs_cli rmtree [OPTIONS] [DIR_PATHS]...
Move a directory tree to the trash and set an expiration time, it will be automatically deleted after expiration
Example:
hf3fs_cli rmtree <path/to/remove> --expire <expire_time>
- Use --expire [1h|3h|8h|1d|3d|7d] to specify the expiration time, the directory will be deleted after expiration.
- Before expiration, you can restore the directory from the trash using `hf3fs_cli mv <trash_path> <target_path>`.
- If you need to free up space immediately, you can use `hf3fs_cli rmtree <trash_path>` to delete the data in the trash immediately, this operation cannot be undone!
- Use `ls /path/to/hf3fs/trash` to view the trash.
Options:
--expire [1h|3h|8h|1d|3d|7d] Expiration time, contents in the trash will be automatically deleted after expiration
-y, --yes Skip confirmation prompt and delete immediately
--help Show this message and exit.
$ hf3fs_cli mv --help
Usage: hf3fs_cli mv [OPTIONS] OLD_PATH NEW_PATH
Move files, supports moving files between different mount points within the same 3FS
Options:
--help Show this message and exit.
```
If you want to use `rmtree` command, the administrator needs to create a trash directory for each user at `/{3fs_mountpoint}/trash/{user_name}`. The cleanup of the trash directory is handled by the `trash_cleaner`. For instructions on how to use it, please refer to `src/client/trash_cleaner/`.

0
hf3fs_utils/__init__.py Normal file
View File

192
hf3fs_utils/cli.py Normal file
View File

@@ -0,0 +1,192 @@
import errno
import click
import os
import sys
import stat
from typing import Optional, List
from hf3fs_utils.fs import is_relative_to, FileSystem
from hf3fs_utils.trash import TRASH_CONFIGS, Trash
# Optional mount-point override; when unset, the mount point is auto-detected
# by probing ancestors of the target path for a "3fs-virt" marker directory.
MOUNTPOINT = os.environ.get("HF3FS_CLI_MOUNTPOINT", None)

def get_filesystem(path: str) -> FileSystem:
    # Locate the 3FS mount point containing *path* and wrap it in a FileSystem.
    mountpoint = None
    if MOUNTPOINT is not None:
        mountpoint = os.path.abspath(MOUNTPOINT)
    else:
        path = os.path.realpath(path)
        parts = path.split(os.sep)
        # A mount point is at most 3 components deep (e.g. /hf3fs/<cluster>);
        # check each ancestor for the 3fs-virt marker.
        for i in range(1, 4):
            p = os.sep.join(parts[:i])
            if os.path.exists(os.path.join(p, "3fs-virt")):
                mountpoint = p
                break
    if not mountpoint:
        abort(f"{path} is not on 3FS")
    return FileSystem(mountpoint)
def abs_path(path: str) -> str:
    """Normalize *path*, resolving the directory part but keeping the final
    component untouched (so rmtree on a symlink targets the link itself)."""
    if ".." in path.split(os.path.sep):
        abort(f"Path {path} contains '..', which is not supported yet")
    normpath = os.path.normpath(path)
    head, tail = os.path.split(normpath)
    return os.path.join(os.path.realpath(head), tail)
def abort(msg):
    # Print *msg* in red to stderr and terminate the CLI with exit status 1.
    click.echo(click.style(msg, fg="red"), err=True)
    sys.exit(1)
@click.group()
def cli():
    """
    3FS command-line tool
    """
    # Group entry point only; subcommands are registered via @cli.command().
@cli.command()
@click.argument("old_path", type=click.Path(exists=True))
@click.argument("new_path", type=click.Path())
def mv(old_path: str, new_path: str):
    """
    Move files, supports moving files between different mount points within the same 3FS
    """
    try:
        old_path = abs_path(old_path)
        new_path = abs_path(new_path)
        try:
            new_st = os.stat(new_path, follow_symlinks=True)
            # new_path exists, should be a directory
            if not stat.S_ISDIR(new_st.st_mode):
                raise FileExistsError(errno.EEXIST, os.strerror(errno.EEXIST), new_path)
            # move to new_path/filename
            new_path = os.path.join(new_path, os.path.basename(old_path))
        except FileNotFoundError:
            # new_path does not exist yet: rename old_path to it directly.
            pass
        fs = get_filesystem(old_path)
        fs.rename(old_path, new_path)
        click.echo(f"Move successful: {old_path} -> {new_path}")
    except AssertionError:
        # Let assertions propagate unchanged instead of turning them into aborts.
        raise
    except Exception as ex:
        abort(f"Move failed: {ex}")
class ExpireType(click.ParamType):
    # click parameter type that normalizes long expiration spellings such as
    # "3hours"/"1day" down to the short keys used by TRASH_CONFIGS ("3h", "1d").
    def get_metavar(self, param) -> str:
        return "[1h|3h|8h|1d|3d|7d]"

    def convert(self, value, param, ctx):
        norm_value = value
        # Note the order: "...hour" (singular) is checked before "...hours".
        if norm_value.endswith("hour"):
            norm_value = norm_value.replace("hour", "h")
        elif norm_value.endswith("hours"):
            norm_value = norm_value.replace("hours", "h")
        elif norm_value.endswith("day"):
            norm_value = norm_value.replace("day", "d")
        elif norm_value.endswith("days"):
            norm_value = norm_value.replace("days", "d")
        if norm_value not in TRASH_CONFIGS.keys():
            self.fail(f"{value} is invalid, valid options are {self.get_metavar()}", param, ctx)
        else:
            return norm_value
@cli.command()
@click.argument("dir_paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--expire",
    type=ExpireType(),
    help="Expiration time, contents in the trash will be automatically deleted after expiration",
)
@click.option("-y", "--yes", is_flag=True, default=False, help="Skip confirmation prompt and delete immediately")
def rmtree(dir_paths: List[str], expire: Optional[str], yes: bool):
    """
    Move a directory tree to the trash and set an expiration time, it will be automatically deleted after expiration

    \b
    Example:
        hf3fs_cli rmtree <path/to/remove> --expire <expire_time>

    \b
    - Use --expire [1h|3h|8h|1d|3d|7d] to specify the expiration time, the directory will be deleted after expiration.
    - Before expiration, you can restore the directory from the trash using `hf3fs_cli mv <trash_path> <target_path>`.
    - If you need to free up space immediately, you can use `hf3fs_cli rmtree <trash_path>` to delete the data in the trash immediately, this operation cannot be undone!
    - Use `ls /hf3fs/{cluster}/trash` to view the trash.
    """
    if not dir_paths:
        abort(f"Please provide the directory path to delete")
    first_path = abs_path(dir_paths[0])
    fs = get_filesystem(first_path)
    fs_trash = Trash(fs)
    # Paths already inside the trash mean "purge now"; otherwise we move them
    # into a trash bucket with an expiration time.
    clean_trash = is_relative_to(first_path, fs_trash.trash_path)
    if not clean_trash:
        if not expire:
            abort(f"Use --expire [1h|3h|8h|1d|3d|7d] to specify the expiration time")
    elif expire:
        abort(f"{first_path} is already in the trash")
    trash_cfg = TRASH_CONFIGS[expire] if not clean_trash else None
    dir_paths = [abs_path(p) for p in dir_paths]
    # All paths must be on the same side: all in the trash, or all outside it.
    for dir_path in dir_paths:
        if is_relative_to(dir_path, fs_trash.trash_path) != clean_trash:
            if clean_trash:
                abort(f"{dir_path} is not in the trash")
            else:
                abort(f"{dir_path} is already in the trash")
    if clean_trash:
        if len(dir_paths) != 1:
            msg = (
                f"Immediately delete the following paths:\n"
                + "\n".join([f"- {p}" for p in dir_paths])
                + "\nThis operation cannot be undone"
            )
        else:
            # Fix: previously relied on `dir_path` leaking out of the loop above.
            msg = f"Immediately delete {dir_paths[0]}, this operation cannot be undone"
    else:
        if len(dir_paths) != 1:
            msg = (
                f"Move the following paths to the trash:\n"
                + "\n".join([f"- {p}" for p in dir_paths])
                + f"\nThey will be automatically deleted after {expire}"
            )
        else:
            # Fix: previously relied on `dir_path` leaking out of the loop above.
            msg = f"Move {dir_paths[0]} to the trash, it will be automatically deleted after {expire}"
    if not yes:
        assert click.confirm(msg, abort=True)
    for dir_path in dir_paths:
        try:
            if clean_trash:
                fs.remove(dir_path, recursive=True)
                click.echo(f"- Deleted {dir_path}")
            else:
                trash_path = fs_trash.move_to_trash(dir_path, trash_cfg)
                click.echo(f"- Trash path: {trash_path}")
        except AssertionError:
            raise
        except Exception as ex:
            abort(f"Failed to delete {dir_path}: {ex}")
    if not clean_trash:
        click.echo(
            "- Before expiration, you can use 'hf3fs_cli mv <trash_path> <target_path>' to restore, "
            "or use 'hf3fs_cli rmtree <trash_path>' to delete immediately and free up space"
        )
if __name__ == "__main__":
    # Allow invoking this module directly as the CLI entry point.
    cli()

234
hf3fs_utils/fs.py Normal file
View File

@@ -0,0 +1,234 @@
import os
import fcntl
import errno
import struct
import stat
import sys
import pathlib
from typing import Tuple
def is_relative_to(path1, path2) -> bool:
    """Return True if *path1* equals *path2* or lies beneath it (purely
    lexical check, no filesystem access)."""
    try:
        pathlib.PurePath(path1).relative_to(path2)
        return True
    except ValueError:
        # relative_to() raises ValueError for non-subpaths; the previous bare
        # `except:` would also have swallowed KeyboardInterrupt/SystemExit.
        return False
class FileSystem:
    """Management interface for a 3FS FUSE mount point.

    Validates the mount at construction time, then issues rename/remove
    operations through ioctls on directory fds, which lets the 3FS client
    implement cross-directory moves (including moves into the trash).
    """
    # ioctl request codes and fixed argument-buffer sizes; must stay in sync
    # with the C definitions in the FUSE client.
    HF3FS_IOCTL_MAGIC_CMD = 0x80046802
    HF3FS_IOCTL_MAGIC_NUM = 0x8F3F5FFF
    HF3FS_IOCTL_VERSION_CMD = 0x80046803
    HF3FS_IOCTL_RENAME_CMD = 0x4218680E
    HF3FS_IOCTL_RENAME_BUFFER_SIZE = 536
    HF3FS_IOCTL_REMOVE_CMD = 0x4110680F
    HF3FS_IOCTL_REMOVE_BUFFER_SIZE = 272

    def __init__(self, mountpoint: str) -> None:
        # Resolve and validate the mount point, then probe the 3fs-virt control
        # directory: the magic-number ioctl proves this really is a 3FS mount,
        # and the version ioctl proves the client supports the required calls.
        self.mountpoint = os.path.realpath(mountpoint)
        self.virt_path = os.path.join(self.mountpoint, "3fs-virt")
        # Check if the mount point is a directory
        if not os.path.exists(self.mountpoint):
            raise FileNotFoundError(
                errno.ENOENT, os.strerror(errno.ENOENT), self.mountpoint
            )
        if not os.path.isdir(self.mountpoint):
            raise NotADirectoryError(
                errno.ENOTDIR, os.strerror(errno.ENOTDIR), self.mountpoint
            )
        virt_fd = None
        try:
            # Check the 3fs-virt directory
            virt_fd = os.open(self.virt_path, os.O_RDONLY | os.O_DIRECTORY)
            virt_st = os.fstat(virt_fd)
            if not stat.S_ISDIR(virt_st.st_mode):
                raise NotADirectoryError(
                    errno.ENOTDIR, os.strerror(errno.ENOTDIR), self.virt_path
                )
            # Remember the device id so later operations can verify that a
            # given path actually lives on this mount.
            self.st_dev = virt_st.st_dev
            # Check the magic number
            buffer = bytearray(4)
            try:
                self._ioctl(virt_fd, FileSystem.HF3FS_IOCTL_MAGIC_CMD, buffer)
            except OSError:
                raise RuntimeError(f"{self.mountpoint} is not a 3FS mount point")
            magic_number = struct.unpack("I", buffer)[0]
            expected_magic_number = FileSystem.HF3FS_IOCTL_MAGIC_NUM
            if magic_number != expected_magic_number:
                raise RuntimeError(
                    f"{self.mountpoint} is not a 3FS mount point, "
                    f"magic number {magic_number:x} != {expected_magic_number:x}"
                )
            # Check if the required ioctl is supported
            ioctl_version = -1
            try:
                buffer = bytearray(4)
                self._ioctl(virt_fd, FileSystem.HF3FS_IOCTL_VERSION_CMD, buffer)
                ioctl_version = int.from_bytes(buffer, sys.byteorder, signed=False)
            except OSError:
                pass  # older clients without the version ioctl fail the assert below
            assert ioctl_version >= 1
        finally:
            if virt_fd is not None:
                os.close(virt_fd)

    def _check_user(self):
        # Refuse to operate as root: all management ioctls go through here.
        if os.geteuid() == 0 or os.getegid() == 0:
            raise RuntimeError(f"root user not allowed")

    def _encode_filename(self, name: str) -> bytes:
        # A single path component, UTF-8 encoded, limited to 255 bytes (the
        # ioctl buffer reserves 256 bytes per name).
        assert name and os.sep not in name, name
        name_bytes = name.encode("utf8")
        if len(name_bytes) > 255:
            raise OSError(errno.ENAMETOOLONG, os.strerror(errno.ENAMETOOLONG), name)
        return name_bytes

    def opendir(self, dir_path: str) -> Tuple[int, os.stat_result]:
        # Open *dir_path* and return (fd, stat), verifying it is a real
        # directory on this mount; the fd is closed on any failure.
        dir_fd = os.open(dir_path, os.O_DIRECTORY | os.O_RDONLY)
        try:
            try:
                dir_st = os.fstat(dir_fd)
            except OSError as ex:
                ex.filename = dir_path
                raise
            if not stat.S_ISDIR(dir_st.st_mode):
                raise NotADirectoryError(
                    errno.ENOTDIR, os.strerror(errno.ENOTDIR), dir_path
                )
            if dir_st.st_dev != self.st_dev:
                raise RuntimeError(f"{dir_path} is not under the 3FS mount point {self.mountpoint}")
            # High inode bits mark 3FS virtual entries, which must not be
            # renamed or removed.
            if dir_st.st_ino & 0xF000000000000000:
                raise RuntimeError(f"{dir_path} is a virtual path")
            return dir_fd, dir_st
        except:
            os.close(dir_fd)
            raise

    def split_path(self, path: str) -> Tuple[int, os.stat_result, str]:
        # Split *path* into (parent dir fd, parent dir stat, final component),
        # validating the component on the way.
        filename = os.path.basename(path)
        if not filename:
            raise RuntimeError(f"{path} has no filename")
        if filename in [".", ".."]:
            # NOTE(review): this message looks garbled by extraction
            # ("(unknown)"); it presumably named the offending component — verify.
            raise RuntimeError(f"{path} filename is (unknown)")
        if len(filename.encode("utf8")) > 255:
            raise OSError(errno.ENAMETOOLONG, os.strerror(errno.ENAMETOOLONG), path)
        dir = os.path.dirname(path) or "."
        dir_fd, dir_st = self.opendir(dir)
        return dir_fd, dir_st, filename

    def rename(self, old_path: str, new_path: str) -> None:
        # Rename old_path to new_path via ioctl; refuses symlink sources,
        # already-existing targets, and destinations inside the trash.
        self._check_user()
        if is_relative_to(
            os.path.realpath(new_path), os.path.join(self.mountpoint, "trash")
        ):
            raise RuntimeError(f"{new_path} is in the trash")
        old_dir_fd = None
        new_dir_fd = None
        try:
            old_dir_fd, old_dir_st, old_filename = self.split_path(old_path)
            new_dir_fd, new_dir_st, new_filename = self.split_path(new_path)
            try:
                old_st = os.stat(old_filename, dir_fd=old_dir_fd, follow_symlinks=False)
                if stat.S_ISLNK(old_st.st_mode):
                    raise RuntimeError(f"{old_path} is symlink")
            except OSError as ex:
                ex.filename = old_path
                raise
            try:
                os.stat(new_filename, dir_fd=new_dir_fd, follow_symlinks=False)
                raise FileExistsError(errno.EEXIST, os.strerror(errno.EEXIST), new_path)
            except FileNotFoundError:
                pass  # target absent: exactly what we want
            except OSError as ex:
                ex.filename = new_path
                raise
            try:
                self._rename_ioctl(
                    old_dir_fd,
                    old_dir_st.st_ino,
                    old_filename,
                    new_dir_st.st_ino,
                    new_filename,
                    False,
                )
            except OSError as ex:
                ex.filename = old_path
                ex.filename2 = new_path
                raise
        finally:
            if old_dir_fd is not None:
                os.close(old_dir_fd)
            if new_dir_fd is not None:
                os.close(new_dir_fd)

    def _rename_ioctl(
        self,
        old_dir_fd: int,
        old_dir_ino: int,
        old_filename: str,
        new_dir_ino: int,
        new_filename: str,
        move_to_trash: bool,
    ) -> None:
        # Argument layout must match the kernel-side struct:
        # (old parent ino, old name[256], new parent ino, new name[256], to-trash flag).
        assert old_filename and not os.path.sep in old_filename, old_filename
        assert new_filename and not os.path.sep in new_filename, new_filename
        cmd = FileSystem.HF3FS_IOCTL_RENAME_CMD
        # NOTE(review): .ljust pads with b' ' (spaces), not NUL bytes — confirm
        # the client ignores bytes past the packed struct.
        buffer = struct.pack(
            "N256sN256s?",
            old_dir_ino,
            self._encode_filename(old_filename),
            new_dir_ino,
            self._encode_filename(new_filename),
            move_to_trash,
        ).ljust(FileSystem.HF3FS_IOCTL_RENAME_BUFFER_SIZE)
        self._ioctl(old_dir_fd, cmd, buffer)

    def remove(self, path: str, recursive: bool) -> None:
        # Remove *path* via ioctl; recursive directory removal additionally
        # requires the caller to own the directory with rwx permissions.
        dir_fd = None
        try:
            dir_fd, dir_st, filename = self.split_path(path)
            st = os.stat(filename, dir_fd=dir_fd, follow_symlinks=False)
            if stat.S_ISLNK(st.st_mode):
                raise RuntimeError(f"{path} is symlink")
            if stat.S_ISDIR(st.st_mode) and recursive:
                # The user must be the owner of the directory and have rwx permissions
                imode = stat.S_IMODE(st.st_mode)
                if st.st_uid != os.geteuid() or (imode & 0o700) != 0o700:
                    raise PermissionError(errno.EPERM, os.strerror(errno.EPERM), path)
            try:
                self._remove_ioctl(dir_fd, dir_st.st_ino, filename, recursive)
            except OSError as ex:
                ex.filename = path
                raise ex
        finally:
            if dir_fd is not None:
                os.close(dir_fd)

    def _remove_ioctl(
        self, parent_fd: int, parent_ino: int, filename: str, recursive: bool
    ) -> None:
        # Argument layout: (parent ino, name[256], recursive flag), padded to
        # the fixed buffer size.
        assert filename and os.sep not in filename, filename
        cmd = FileSystem.HF3FS_IOCTL_REMOVE_CMD
        buffer = struct.pack(
            "N256s?", parent_ino, self._encode_filename(filename), recursive
        ).ljust(FileSystem.HF3FS_IOCTL_REMOVE_BUFFER_SIZE)
        self._ioctl(parent_fd, cmd, buffer)

    def _ioctl(self, fd: int, cmd: int, buffer):
        # Single choke point for all management ioctls; always re-checks the user.
        self._check_user()
        return fcntl.ioctl(fd, cmd, buffer)

5
hf3fs_utils/hf3fs_cli Normal file
View File

@@ -0,0 +1,5 @@
#!/usr/bin/env python
# Console-script entry point for the 3FS CLI.
from hf3fs_utils import cli

if __name__ == "__main__":
    # Widen click's help output to 120 columns.
    cli.cli(max_content_width=120)

176
hf3fs_utils/trash.py Normal file
View File

@@ -0,0 +1,176 @@
import os
import dataclasses
import pwd
import stat
import errno
import time
from typing import Optional
from datetime import datetime, timedelta, timezone
from . import fs
# Trash bucket timestamps are rendered in UTC+8 with minute resolution.
UTC8_TZ = timezone(timedelta(hours=8))
DATE_FORMAT = "%Y%m%d_%H%M"
# Fixed epoch used to align time-slice boundaries across processes.
BASE_TIMESTAMP = int(datetime(year=1980, month=1, day=1, tzinfo=UTC8_TZ).timestamp())
def format_date(t: datetime) -> str:
    # Render an aware datetime as YYYYmmdd_HHMM in UTC+8.
    assert t.tzinfo
    return t.astimezone(tz=UTC8_TZ).strftime(DATE_FORMAT)

def parse_date(t: str) -> datetime:
    # Inverse of format_date(); the result is timezone-aware (UTC+8).
    return datetime.strptime(t, DATE_FORMAT).replace(tzinfo=UTC8_TZ)

def get_timestamp_us() -> int:
    # Current wall-clock time as integer microseconds.
    timestamp_seconds = time.time()
    return int(timestamp_seconds * 1_000_000)
@dataclasses.dataclass
class TrashConfig:
    """One expiration tier of the trash.

    Bucket directory names encode the tier, the bucket's start time, and its
    deletion deadline: <name>-<start>-<deadline>.
    """
    name: str              # tier label (e.g. "1h"); must not contain '-'
    expire: timedelta      # how long entries live before deletion
    time_slice: timedelta  # width of one bucket directory

    def __post_init__(self):
        # Validate on construction so bad tiers fail fast.
        assert self.name and "-" not in self.name, f"invalid name {self.name}"
        assert self.expire >= timedelta(minutes=1), self.expire
        assert self.time_slice >= timedelta(minutes=1), self.time_slice
        assert self.time_slice < self.expire, (self.time_slice, self.expire)

    def current_dir(self) -> str:
        """Directory name of the bucket covering the current moment."""
        now_ts = int(datetime.now(tz=UTC8_TZ).timestamp())
        assert now_ts > BASE_TIMESTAMP, now_ts
        slice_s = int(self.time_slice.total_seconds())
        expire_s = int(self.expire.total_seconds())
        assert slice_s and expire_s, repr(self)
        # Snap down to the start of the current slice, measured from BASE_TIMESTAMP.
        start_ts = BASE_TIMESTAMP + ((now_ts - BASE_TIMESTAMP) // slice_s) * slice_s
        # Deadline is one slice past the expiry so a bucket is never reaped early.
        end_ts = start_ts + expire_s + slice_s
        start_dt = datetime.fromtimestamp(start_ts, tz=UTC8_TZ)
        end_dt = datetime.fromtimestamp(end_ts, tz=UTC8_TZ)
        return f"{self.name}-{format_date(start_dt)}-{format_date(end_dt)}"
# Available expiration tiers, keyed by the CLI's --expire option value.
TRASH_CONFIGS = {
    "1h": TrashConfig("1h", timedelta(hours=1), timedelta(minutes=10)),
    "3h": TrashConfig("3h", timedelta(hours=3), timedelta(minutes=30)),
    "8h": TrashConfig("8h", timedelta(hours=8), timedelta(minutes=30)),
    "1d": TrashConfig("1d", timedelta(days=1), timedelta(hours=1)),
    "3d": TrashConfig("3d", timedelta(days=3), timedelta(days=1)),
    "7d": TrashConfig("7d", timedelta(days=7), timedelta(days=1)),
}
class Trash:
    """Per-user trash manager on top of a FileSystem.

    Entries are moved (via the rename ioctl with move_to_trash=True) into
    time-bucketed directories under <mount>/trash/<user>/; an external
    cleaner deletes expired buckets.
    """
    def __init__(
        self,
        filesystem: fs.FileSystem,
        user: Optional[int] = None,
        user_name: Optional[str] = None,
    ) -> None:
        # Default to the effective uid and its passwd name; root is refused.
        if user is None:
            user = os.geteuid()
        assert isinstance(user, int), user
        if user_name is None:
            user_name = pwd.getpwuid(user).pw_name
        if user == 0:
            raise RuntimeError(f"hf3fs trash does not support root user")
        # Check if the trash directory is mounted
        trash = os.path.join(filesystem.mountpoint, "trash")
        if not os.path.exists(trash):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), trash)
        # Check if the user's trash directory exists
        user_trash = os.path.join(filesystem.mountpoint, "trash", user_name)
        if not os.path.exists(user_trash):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), user_trash)
        user_trash_fd, user_trash_st = filesystem.opendir(user_trash)
        os.close(user_trash_fd)
        assert stat.S_ISDIR(user_trash_st.st_mode)
        # The user's trash directory must be owned by that user.
        if user_trash_st.st_uid != user:
            raise RuntimeError(
                f"Trash directory {user_trash}, owner {user_trash_st.st_uid} != {user}"
            )
        self.filesystem = filesystem
        self.user = user
        self.user_name = user_name
        self.trash_path = trash
        self.user_trash_path = user_trash

    def _check_user(self):
        # Only the trash owner may move entries into it.
        euid = os.geteuid()
        if euid != self.user:
            raise RuntimeError(f"euid {euid} != trash owner {self.user}")

    def move_to_trash(
        self,
        path: str,
        config: TrashConfig,
        trash_name: Optional[str] = None,
        append_timestamp_if_exists: bool = True,
    ) -> str:
        # Move *path* into the current bucket for *config* and return the
        # resulting trash path. On name collisions, a microsecond timestamp is
        # appended (up to 10 attempts) when append_timestamp_if_exists is set.
        self._check_user()
        assert isinstance(config, TrashConfig), f"invalid trash config {config}"
        dir_fd = None
        trash_dir_fd = None
        try:
            dir_fd, dir_st, filename = self.filesystem.split_path(path)
            try:
                st = os.stat(filename, dir_fd=dir_fd, follow_symlinks=False)
            except OSError as ex:
                ex.filename = path
                raise
            if stat.S_ISDIR(st.st_mode):
                # The user must be the owner of the directory and have rwx permissions.
                imode = stat.S_IMODE(st.st_mode)
                if st.st_uid != os.geteuid() or (imode & 0o700) != 0o700:
                    raise PermissionError(errno.EPERM, os.strerror(errno.EPERM), path)
            trash_dir = os.path.join(self.user_trash_path, config.current_dir())
            try:
                os.mkdir(trash_dir, 0o755)
            except FileExistsError:
                pass  # bucket already created, possibly by a concurrent process
            trash_dir_fd, trash_dir_st = self.filesystem.opendir(trash_dir)
            trash_name = trash_name or filename
            current_trash_name = trash_name
            retry = 0
            while True:
                retry += 1
                try:
                    # move_to_trash=True lets the FUSE client treat this
                    # rename as a trash move.
                    self.filesystem._rename_ioctl(
                        dir_fd,
                        dir_st.st_ino,
                        filename,
                        trash_dir_st.st_ino,
                        current_trash_name,
                        True,
                    )
                    return os.path.join(trash_dir, current_trash_name)
                except OSError as ex:
                    if (
                        ex.errno in (errno.ENOTDIR, errno.EEXIST, errno.ENOTEMPTY)
                        and append_timestamp_if_exists
                        and retry < 10
                    ):
                        # Name taken: truncate to 200 chars, append a
                        # microsecond timestamp, and retry.
                        current_trash_name = f"{trash_name[0:200]}.{get_timestamp_us()}"
                    else:
                        raise
        finally:
            if dir_fd is not None:
                os.close(dir_fd)
            if trash_dir_fd is not None:
                os.close(trash_dir_fd)

View File

@@ -0,0 +1,7 @@
Copyright (c) 2018 Pranav Srinivas Kumar <pranav.srinivas.kumar@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

2292
licenses/arrow/LICENSE.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,23 @@
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

View File

@@ -0,0 +1,207 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-------------------------------------------------------------------------------
SOFTWARE DISTRIBUTED WITH FOUNDATIONDB:
The FoundationDB software includes a number of subcomponents with separate
copyright notices and license terms - please see the file ACKNOWLEDGEMENTS.
-------------------------------------------------------------------------------

27
licenses/leveldb/LICENSE Normal file
View File

@@ -0,0 +1,27 @@
Copyright (c) 2011 The LevelDB Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -0,0 +1,25 @@
Copyright (c) 2010-2014, Salvatore Sanfilippo <antirez at gmail dot com>
Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2019 - 2024 Daniil Goncharov
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

21
licenses/nameof/LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2016 - 2024 Daniil Goncharov
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

29
licenses/rapidcsv/LICENSE Normal file
View File

@@ -0,0 +1,29 @@
BSD 3-Clause License
Copyright (c) 2017, Kristofer Berggren
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2018-2021 Martin Ankerl
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

23
licenses/smhasher/LICENSE Normal file
View File

@@ -0,0 +1,23 @@
All MurmurHash source files are placed in the public domain.
The license below applies to all other code in SMHasher:
Copyright (c) 2011 Google, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

View File

@@ -0,0 +1,16 @@
MIT License
Copyright (c) Mark Gillard <mark.gillard@outlook.com.au>
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

24
licenses/utf8.h/LICENSE Normal file
View File

@@ -0,0 +1,24 @@
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org/>

17
patches/apply.sh Executable file
View File

@@ -0,0 +1,17 @@
#!/bin/bash
# Apply local patches to vendored third_party repositories, idempotently.
set -e
cd "$(dirname "$0")"

# apply_patch <repo-name> <patch-file>
# Skips the patch when `git apply --reverse --check` shows it is already in.
apply_patch() {
  local repo="$1" patch="$2"
  if git -C "../third_party/${repo}" apply --reverse --check "../../patches/${patch}" &>/dev/null; then
    echo "${repo} patch already applied. skipping."
  else
    git -C "../third_party/${repo}" apply "../../patches/${patch}"
  fi
}

apply_patch rocksdb rocksdb.patch
apply_patch folly folly.patch

1437
patches/folly.patch Normal file

File diff suppressed because it is too large Load Diff

13
patches/rocksdb.patch Normal file
View File

@@ -0,0 +1,13 @@
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e9e506951..79a68507f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -620,7 +620,7 @@ set(SOURCES
db/blob/blob_source.cc
db/blob/prefetch_buffer_collection.cc
db/builder.cc
- db/c.cc
+ # db/c.cc
db/column_family.cc
db/compaction/compaction.cc
db/compaction/compaction_iterator.cc

94
setup.py Normal file
View File

@@ -0,0 +1,94 @@
import os
import re
import subprocess
import sys

from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext

# import setuptools_scm as stscm
# version = stscm.get_version(root=".", relative_to=__file__)

# Package version: base release plus the short git revision of HEAD,
# e.g. "1.2.9+abc1234". Requires running from inside a git checkout.
rev = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode('ascii').rstrip()
version = "1.2.9+" + rev
print('package version', version)
# A CMakeExtension needs a sourcedir instead of a file list.
# The name must be the _single_ output extension from the CMake build.
# If you need multiple extensions, see scikit-build.
class CMakeExtension(Extension):
    """setuptools Extension placeholder whose actual build is done by CMake."""

    def __init__(self, name, sourcedir=""):
        # No Python-visible sources: CMake owns the build inputs.
        super().__init__(name, sources=[])
        self.sourcedir = os.path.abspath(sourcedir)
class CMakeBuild(build_ext):
    """build_ext command that configures and builds extensions via CMake."""

    def build_extension(self, ext):
        """Run a CMake configure + build for `ext` (a CMakeExtension).

        The resulting shared library is emitted directly into the staging
        directory setuptools expects for this extension (extdir).
        """
        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
        # required for auto-detection & inclusion of auxiliary "native" libs
        if not extdir.endswith(os.path.sep):
            extdir += os.path.sep

        # The DEBUG environment variable selects the build type unless
        # --debug was passed to build_ext explicitly.
        debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug
        cfg = "Debug" if debug else "RelWithDebInfo"

        # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON
        cmake_args = [
            f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}",
            f"-DPYTHON_EXECUTABLE={sys.executable}",
            f"-DCMAKE_BUILD_TYPE={cfg}",  # not used on MSVC, but no harm
            "-DCMAKE_CXX_COMPILER=clang++-14",
            "-DCMAKE_C_COMPILER=clang-14",
            "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON",
            "-DUSE_RTTI=ON",
            "-DOVERRIDE_CXX_NEW_DELETE=OFF",
            "-DSAVE_ALLOCATE_SIZE=OFF",
            "-DFOLLY_DISABLE_LIBUNWIND=ON",
        ]
        build_args = []

        # Adding CMake arguments set as environment variable
        # (needed e.g. to build for ARM OSx on conda-forge)
        if "CMAKE_ARGS" in os.environ:
            cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item]

        # cmake_args += [f"-DPYCLIENT_VERSION_INFO={self.distribution.get_version()}"]

        # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level
        # across all generators.
        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
            # self.parallel is a Python 3 only way to set parallel jobs by hand
            # using -j in the build_ext call, not supported by pip or PyPA-build.
            if hasattr(self, "parallel") and self.parallel:
                # CMake 3.12+ only.
                build_args += [f"-j{self.parallel}"]

        build_temp = self.build_temp
        os.makedirs(build_temp, exist_ok=True)

        subprocess.check_call(["cmake", "-S", ext.sourcedir] + cmake_args, cwd=build_temp)
        subprocess.check_call(
            ["cmake", "--build", ".", "--target", "hf3fs_py_usrbio"] + build_args,
            cwd=build_temp,
        )
# The information here can also be placed in setup.cfg - better separation of
# logic and declaration, and simpler if you include description/version in a file.
setup(
    name="hf3fs_py_usrbio",
    version=version,  # "1.2.9+<git short rev>", computed above
    description="Python binding for hf3fs client library",
    long_description="",
    packages=['hf3fs_fuse'],
    # Single CMake-built extension; built by the CMakeBuild command below.
    ext_modules=[CMakeExtension("hf3fs_py_usrbio")],
    cmdclass={"build_ext": CMakeBuild},
    zip_safe=False,
    extras_require={"test": ["pytest>=6.0"]},
    python_requires=">=3.6",
)

Some files were not shown because too many files have changed in this diff Show More