Initial commit

This commit is contained in:
dev
2025-02-27 21:53:53 +08:00
commit 815e55e4c0
1291 changed files with 185445 additions and 0 deletions

2
.cargo/config.toml Normal file
View File

@@ -0,0 +1,2 @@
[build]
rustflags = ["-Clinker=clang++-14", "-Clink-arg=-fuse-ld=lld"]

29
.clang-format Normal file
View File

@@ -0,0 +1,29 @@
BasedOnStyle: Google
Language: Cpp
Standard: c++20
AlignAfterOpenBracket: Align
AlignEscapedNewlines: Left
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: false
AllowAllParametersOfDeclarationOnNextLine: false
AlwaysBreakAfterReturnType: None
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BitFieldColonSpacing: Both
BreakConstructorInitializers: BeforeColon
BreakInheritanceList: BeforeColon
IndentWidth: 2
PointerAlignment: Right
DerivePointerAlignment: false
ColumnLimit: 120
PackConstructorInitializers: Never
SpaceAfterTemplateKeyword: true
IncludeCategories:
- Regex: 'Flat_generated\.h("|>)'
Priority: 1000
SortPriority: 1000
- Regex: '"fbs/macros/'
Priority: 1001
SortPriority: 1001

117
.clang-tidy Normal file
View File

@@ -0,0 +1,117 @@
Checks: >
-*,
bugprone-undelegated-constructor,
bugprone-argument-comment,
bugprone-bad-signal-to-kill-thread,
bugprone-bool-pointer-implicit-conversion,
bugprone-copy-constructor-init,
bugprone-dangling-handle,
bugprone-forward-declaration-namespace,
bugprone-fold-init-type,
bugprone-inaccurate-erase,
bugprone-incorrect-roundings,
bugprone-infinite-loop,
bugprone-integer-division,
bugprone-macro-repeated-side-effects,
bugprone-misplaced-operator-in-strlen-in-alloc,
bugprone-misplaced-pointer-arithmetic-in-alloc,
bugprone-misplaced-widening-cast,
bugprone-move-forwarding-reference,
bugprone-multiple-statement-macro,
bugprone-parent-virtual-call,
bugprone-posix-return,
bugprone-reserved-identifier,
bugprone-signed-char-misuse,
bugprone-sizeof-container,
bugprone-sizeof-expression,
bugprone-string-constructor,
bugprone-string-integer-assignment,
bugprone-string-literal-with-embedded-nul,
bugprone-suspicious-enum-usage,
bugprone-suspicious-include,
bugprone-suspicious-memset-usage,
bugprone-suspicious-missing-comma,
bugprone-suspicious-string-compare,
bugprone-swapped-arguments,
bugprone-terminating-continue,
bugprone-throw-keyword-missing,
bugprone-too-small-loop-variable,
bugprone-undefined-memory-manipulation,
bugprone-unhandled-self-assignment,
bugprone-unused-raii,
bugprone-unused-return-value,
bugprone-use-after-move,
bugprone-virtual-near-miss,
# bugprone-macro-parentheses,
# bugprone-narrowing-conversions,
# bugprone-exception-escape,
performance-faster-string-find,
performance-for-range-copy,
performance-implicit-conversion-in-loop,
performance-inefficient-algorithm,
performance-inefficient-vector-operation,
performance-move-constructor-init,
performance-no-automatic-move,
performance-trivially-destructible,
performance-unnecessary-copy-initialization,
performance-move-const-arg,
modernize-avoid-bind,
modernize-loop-convert,
modernize-make-shared,
modernize-make-unique,
modernize-raw-string-literal,
modernize-redundant-void-arg,
modernize-replace-auto-ptr,
modernize-replace-random-shuffle,
modernize-use-auto,
modernize-use-bool-literals,
modernize-use-nullptr,
modernize-use-using,
modernize-use-override,
modernize-use-equals-default,
modernize-use-equals-delete,
misc-throw-by-value-catch-by-reference,
misc-misplaced-const,
misc-unconventional-assign-operator,
misc-redundant-expression,
misc-static-assert,
misc-unconventional-assign-operator,
misc-uniqueptr-reset-release,
misc-unused-alias-decls,
misc-unused-parameters,
misc-unused-using-decls,
readability-avoid-const-params-in-decls,
readability-const-return-type,
readability-container-size-empty,
readability-delete-null-pointer,
readability-deleted-default,
readability-misplaced-array-index,
readability-non-const-parameter,
readability-redundant-control-flow,
readability-redundant-function-ptr-dereference,
readability-redundant-smartptr-get,
readability-redundant-string-cstr,
readability-redundant-string-init,
readability-static-definition-in-anonymous-namespace,
readability-string-compare,
readability-uniqueptr-delete-release,
readability-simplify-subscript-expr,
readability-simplify-boolean-expr,
readability-inconsistent-declaration-parameter-name,
# readability-qualified-auto,
cert-flp30-c,
cert-mem57-cpp,
cert-oop58-cpp,
google-build-explicit-make-pair,
google-runtime-operator,
hicpp-exception-baseclass,
cppcoreguidelines-virtual-class-destructor,
WarningsAsErrors: ""
CheckOptions:
- key: performance-move-const-arg.CheckTriviallyCopyableMove
value: false

3
.clangd Normal file
View File

@@ -0,0 +1,3 @@
CompileFlags:
Add: -fcoroutines-ts
Remove: [-fcoroutines, -DNO_INTELLISENSE]

3
.dockerignore Normal file
View File

@@ -0,0 +1,3 @@
clang
build
deploy/*/Dockerfile

36
.github/workflows/build.yml vendored Normal file
View File

@@ -0,0 +1,36 @@
name: Build
on:
push:
branches: [ "main" ]
jobs:
build:
runs-on: self-hosted # or `ubuntu-22.04`
steps:
- uses: actions/checkout@v4
- name: Configure sccache-cache
run: |
echo "RUSTC_WRAPPER=sccache" >> $GITHUB_ENV
echo "SCCACHE_GHA_ENABLED=true" >> $GITHUB_ENV
- name: Run sccache-cache
uses: mozilla-actions/sccache-action@v0.0.4
- name: Prepare
run: |
sudo apt install -y cmake libuv1-dev liblz4-dev liblzma-dev libdouble-conversion-dev libprocps-dev libdwarf-dev libunwind-dev
sudo apt install -y libaio-dev libgflags-dev libgoogle-glog-dev libgtest-dev libgmock-dev clang-format-14 clang-14 clang-tidy-14 lld-14
sudo apt install -y libgoogle-perftools-dev google-perftools libssl-dev ccache gcc-12 g++-12 libboost-all-dev meson rustc cargo
wget https://github.com/apple/foundationdb/releases/download/7.1.61/foundationdb-clients_7.1.61-1_amd64.deb && sudo dpkg -i foundationdb-clients_7.1.61-1_amd64.deb
git clone https://github.com/libfuse/libfuse.git libfuse -b fuse-3.16.2 --depth 1 && mkdir libfuse/build && cd libfuse/build && meson setup .. && ninja && sudo ninja install && cd ../.. && rm -rf libfuse
git submodule update --init --recursive
./patches/apply.sh
- name: Build
run: |
cargo build --release
cmake -S . -B build -DCMAKE_CXX_COMPILER=clang++-14 -DCMAKE_C_COMPILER=clang-14 -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
cmake --build build -j 32

17
.gitignore vendored Normal file
View File

@@ -0,0 +1,17 @@
/.cache
/.vscode
/build
/debug
/release
/clang
/clangdbg
/packages
*.o
*.a
*.so
*.log
compile_commands.json
*_generated.h
*~
__pycache__
/target

42
.gitmodules vendored Normal file
View File

@@ -0,0 +1,42 @@
[submodule "third_party/googletest"]
path = third_party/googletest
url = https://github.com/google/googletest.git
[submodule "third_party/folly"]
path = third_party/folly
url = https://github.com/facebook/folly.git
[submodule "third_party/leveldb"]
path = third_party/leveldb
url = https://github.com/google/leveldb.git
[submodule "third_party/rocksdb"]
path = third_party/rocksdb
url = https://github.com/facebook/rocksdb.git
[submodule "third_party/scnlib"]
path = third_party/scnlib
url = https://github.com/eliaskosunen/scnlib.git
[submodule "third_party/pybind11"]
path = third_party/pybind11
url = https://github.com/pybind/pybind11.git
[submodule "third_party/clickhouse-cpp"]
path = third_party/clickhouse-cpp
url = https://github.com/ClickHouse/clickhouse-cpp.git
[submodule "third_party/fmt"]
path = third_party/fmt
url = https://github.com/fmtlib/fmt.git
[submodule "third_party/toml11"]
path = third_party/toml11
url = https://github.com/ToruNiina/toml11.git
[submodule "third_party/jemalloc"]
path = third_party/jemalloc
url = https://github.com/jemalloc/jemalloc.git
[submodule "third_party/mimalloc"]
path = third_party/mimalloc
url = https://github.com/microsoft/mimalloc.git
[submodule "third_party/zstd"]
path = third_party/zstd
url = https://github.com/facebook/zstd.git
[submodule "third_party/liburing"]
path = third_party/liburing
url = https://github.com/axboe/liburing.git
[submodule "third_party/gtest-parallel"]
path = third_party/gtest-parallel
url = https://github.com/google/gtest-parallel.git

172
CMakeLists.txt Normal file
View File

@@ -0,0 +1,172 @@
cmake_minimum_required(VERSION 3.12)
project(3FS VERSION 0.1.5 LANGUAGES C CXX)
set(CMAKE_CONFIGURATION_TYPES "RelWithDebInfo;Debug;Release;MinSizeRel" CACHE STRING "" FORCE)
if (NOT CMAKE_BUILD_TYPE OR CMAKE_BUILD_TYPE STREQUAL "None")
set (CMAKE_BUILD_TYPE "RelWithDebInfo")
message (STATUS "CMAKE_BUILD_TYPE is not set, set to default = ${CMAKE_BUILD_TYPE}")
endif ()
message (STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
string (TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
if(CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG" )
option(ENABLE_ASSERTIONS "Enable assertions" ON)
else()
option(ENABLE_ASSERTIONS "Enable assertions" OFF)
endif()
message (STATUS "ENABLE_ASSERTIONS: ${ENABLE_ASSERTIONS}")
if(ENABLE_ASSERTIONS)
add_definitions(-D_DEBUG)
# On non-Debug builds cmake automatically defines NDEBUG, so we explicitly undefine it:
if(NOT CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG")
# NOTE: use `add_compile_options` rather than `add_definitions` since
# `add_definitions` does not support generator expressions.
add_compile_options($<$<OR:$<COMPILE_LANGUAGE:C>,$<COMPILE_LANGUAGE:CXX>>:-UNDEBUG>)
endif()
endif()
option(OVERRIDE_CXX_NEW_DELETE "Override C++ new/delete operator" OFF)
option(SAVE_ALLOCATE_SIZE "Use more memory to save allocate size" OFF)
option(ENABLE_FUSE_APPLICATION "" ON)
if (DEFINED SANITIZER AND SANITIZER)
set(OVERRIDE_CXX_NEW_DELETE OFF)
endif()
message (STATUS "OVERRIDE_CXX_NEW_DELETE: ${OVERRIDE_CXX_NEW_DELETE}")
if (OVERRIDE_CXX_NEW_DELETE)
add_definitions(-DOVERRIDE_CXX_NEW_DELETE)
if (SAVE_ALLOCATE_SIZE)
add_definitions(-DSAVE_ALLOCATE_SIZE)
endif()
endif()
message (STATUS "SAVE_ALLOCATE_SIZE: ${SAVE_ALLOCATE_SIZE}")
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED OFF)
set(CMAKE_C_EXTENSIONS ON)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcoroutines-ts")
set(CMAKE_CXX_LINK_FLAGS "${CMAKE_CXX_LINK_FLAGS} -latomic")
add_link_options(-fuse-ld=lld)
# Do not build with libc++ (LLVM's implementation of the C++ standard library) in fdb
set(USE_LIBCXX OFF)
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcoroutines")
endif()
# Remove project root from the __FILE__ macro variable
add_compile_options(-fmacro-prefix-map=${CMAKE_SOURCE_DIR}=.)
add_compile_options(-msse4.2 -mavx2)
add_compile_definitions(ROCKSDB_NAMESPACE=rocksdb_internal)
include(cmake/Sanitizers.cmake)
include(cmake/CompileFlags.cmake)
# folly can't work normally under -Werror
store_compile_flags()
add_subdirectory("third_party/fmt" EXCLUDE_FROM_ALL)
restore_compile_flags()
set(ZSTD_BUILD_STATIC ON)
add_subdirectory("third_party/zstd/build/cmake" EXCLUDE_FROM_ALL)
set(zstd_INCLUDE_DIRS "${PROJECT_SOURCE_DIR}/third_party/zstd/lib")
set(ZSTD_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/third_party/zstd/lib")
set(zstd_FOUND ON)
set(ZSTD_FOUND ON)
set(zstd_LIBRARIES "${PROJECT_BINARY_DIR}/third_party/zstd/build/cmake/lib/libzstd.a")
set(ZSTD_LIBRARY "${PROJECT_BINARY_DIR}/third_party/zstd/build/cmake/lib/libzstd.a")
restore_compile_flags()
add_subdirectory("third_party/googletest" EXCLUDE_FROM_ALL)
restore_compile_flags()
set(FOLLY_NO_EXCEPTION_TRACER ON)
add_subdirectory("third_party/folly" EXCLUDE_FROM_ALL)
restore_compile_flags()
set(LEVELDB_BUILD_TESTS OFF CACHE BOOL "Disable LevelDB tests")
set(LEVELDB_BUILD_BENCHMARKS OFF CACHE BOOL "Disable LevelDB benchmarks")
set(LEVELDB_INSTALL OFF CACHE BOOL "Disable LevelDB install")
add_subdirectory("third_party/leveldb" EXCLUDE_FROM_ALL)
restore_compile_flags()
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
set(WITH_LZ4 ON)
set(WITH_ZSTD ON)
set(USE_RTTI ON)
set(WITH_TESTS OFF)
set(WITH_BENCHMARK_TOOLS OFF)
set(WITH_TOOLS OFF)
set(WITH_ALL_TESTS OFF)
add_subdirectory("third_party/rocksdb" EXCLUDE_FROM_ALL)
restore_compile_flags()
set(SCN_TESTS OFF)
set(SCN_EXAMPLES OFF)
set(SCN_BENCHMARKS OFF)
set(SCN_DOCS OFF)
set(SCN_INSTALL OFF)
set(SCN_PEDANTIC OFF)
add_subdirectory("third_party/scnlib" EXCLUDE_FROM_ALL)
restore_compile_flags()
add_subdirectory("third_party/pybind11" EXCLUDE_FROM_ALL)
restore_compile_flags()
add_subdirectory("third_party/toml11" EXCLUDE_FROM_ALL)
restore_compile_flags()
set (MI_OVERRIDE OFF)
add_subdirectory("third_party/mimalloc" EXCLUDE_FROM_ALL)
restore_compile_flags()
add_subdirectory("third_party/clickhouse-cpp" EXCLUDE_FROM_ALL)
TARGET_INCLUDE_DIRECTORIES(clickhouse-cpp-lib
PUBLIC ${PROJECT_SOURCE_DIR}/third_party/clickhouse-cpp
)
TARGET_INCLUDE_DIRECTORIES(clickhouse-cpp-lib-static
PUBLIC ${PROJECT_SOURCE_DIR}/third_party/clickhouse-cpp
)
TARGET_INCLUDE_DIRECTORIES (absl-lib
PUBLIC ${PROJECT_SOURCE_DIR}/third_party/clickhouse-cpp/contrib
)
restore_compile_flags()
add_subdirectory("third_party/liburing-cmake" EXCLUDE_FROM_ALL)
restore_compile_flags()
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror -Wpedantic")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMPILER_FLAGS}")
set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3 ${DEBUG_INFO_FLAGS} ${CMAKE_CXX_FLAGS_ADD}")
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 ${DEBUG_INFO_FLAGS} -fno-inline ${CMAKE_CXX_FLAGS_ADD}")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMPILER_FLAGS} ${CMAKE_C_FLAGS_ADD}")
set (CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -O3 ${DEBUG_INFO_FLAGS} ${CMAKE_C_FLAGS_ADD}")
set (CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0 ${DEBUG_INFO_FLAGS} -fno-inline ${CMAKE_C_FLAGS_ADD}")
set (CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${COMPILER_FLAGS} ${CMAKE_ASM_FLAGS_ADD}")
set (CMAKE_ASM_FLAGS_RELWITHDEBINFO "${CMAKE_ASM_FLAGS_RELWITHDEBINFO} -O3 ${DEBUG_INFO_FLAGS} ${CMAKE_ASM_FLAGS_ADD}")
set (CMAKE_ASM_FLAGS_DEBUG "${CMAKE_ASM_FLAGS_DEBUG} -O0 ${DEBUG_INFO_FLAGS} -fno-inline ${CMAKE_ASM_FLAGS_ADD}")
set(Boost_USE_STATIC_LIBS ON)
set(FDB_VERSION 7.1.5-ibe)
find_package(Threads REQUIRED)
find_package(Boost REQUIRED COMPONENTS filesystem system program_options)
find_library(LIBUV_LIBRARY NAMES libuv1)
enable_testing()
include(cmake/CodeCoverage.cmake)
include(cmake/CLangFormat.cmake)
include(cmake/CLangTidy.cmake)
include(cmake/Target.cmake)
include(cmake/DumpConfig.cmake)
include(cmake/Jemalloc.cmake)
include(cmake/ApacheArrow.cmake)
include(cmake/AddCrate.cmake)
configure_file(cmake/CTestCustom.cmake ${CMAKE_BINARY_DIR} @ONLY)
add_subdirectory(src)
add_subdirectory(tests)
add_subdirectory(benchmarks)

1968
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

16
Cargo.toml Normal file
View File

@@ -0,0 +1,16 @@
[workspace]
members = [
"src/client/trash_cleaner",
"src/storage/chunk_engine"
]
resolver = "2"
[workspace.package]
authors = ["dev <noreply@deepseek.com>"]
edition = "2021"
license = "MIT"
[profile.release-cmake]
debug = true
inherits = "release"
lto = true

21
LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2025 DeepSeek
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

101
README.md Normal file
View File

@@ -0,0 +1,101 @@
# Fire-Flyer File System
[![Build](https://github.com/deepseek-ai/3fs/actions/workflows/build.yml/badge.svg)](https://github.com/deepseek-ai/3fs/actions/workflows/build.yml)
[![License](https://img.shields.io/badge/LICENSE-MIT-blue.svg)](LICENSE)
The Fire-Flyer File System (3FS) is a high-performance distributed file system designed to address the challenges of AI training and inference workloads. It leverages modern SSDs and RDMA networks to provide a shared storage layer that simplifies development of distributed applications. Key features and benefits of 3FS include:
- Performance and Usability
- **Disaggregated Architecture** Combines the throughput of thousands of SSDs and the network bandwidth of hundreds of storage nodes, enabling applications to access storage resources in a locality-oblivious manner.
- **Strong Consistency** Implements Chain Replication with Apportioned Queries (CRAQ) for strong consistency, making application code simple and easy to reason about.
- **File Interfaces** Develops stateless metadata services backed by a transactional key-value store (e.g., FoundationDB). The file interface is well known and used everywhere. There is no need to learn a new storage API.
- Diverse Workloads
- **Data Preparation** Organizes outputs of data analytics pipelines into hierarchical directory structures and manages large volume of intermediate outputs efficiently.
- **Dataloaders** Eliminates the need for prefetching or shuffling datasets by enabling random access to training samples across compute nodes.
- **Checkpointing** Supports high-throughput parallel checkpointing for large-scale training.
- **KVCache for Inference** Provides a cost-effective alternative to DRAM-based caching, offering high throughput and significantly larger capacity.
## Documentation
* [Design Notes](docs/design_notes.md)
* [Setup Guide](deploy/README.md)
* [USRBIO API Reference](src/lib/api/UsrbIo.md)
* [P Specifications](./specs/README.md)
## Performance
### 1. Peak throughput
The following figure demonstrates the throughput of read stress test on a large 3FS cluster. This cluster consists of 180 storage nodes, each equipped with 2×200Gbps InfiniBand NICs and sixteen 14TiB NVMe SSDs. Approximately 500+ client nodes were used for the read stress test, with each client node configured with 1x200Gbps InfiniBand NIC. The final aggregate read throughput reached approximately 6.6 TiB/s with background traffic from training jobs.
![Large block read throughput under stress test on a 180-node cluster](docs/images/peak_throughput.jpg)
### 2. GraySort
We evaluated [smallpond](https://github.com/deepseek-ai/smallpond) using the GraySort benchmark, which measures sort performance on large-scale datasets. Our implementation adopts a two-phase approach: (1) partitioning data via shuffle using the prefix bits of keys, and (2) in-partition sorting. Both phases read/write data from/to 3FS.
The test cluster comprised 25 storage nodes (2 NUMA domains/node, 1 storage service/NUMA, 2×400Gbps NICs/node) and 50 compute nodes (2 NUMA domains, 192 physical cores, 2.2 TiB RAM, and 1×200 Gbps NIC/node). Sorting 110.5 TiB of data across 8,192 partitions completed in 30 minutes and 14 seconds, achieving an average throughput of *3.66 TiB/min*.
![](docs/images/gray_sort_server.png)
![](docs/images/gray_sort_client.png)
### 3. KVCache
KVCache is a technique used to optimize the LLM inference process. It avoids redundant computations by caching the key and value vectors of previous tokens in the decoder layers.
The top figure demonstrates the read throughput of all KVCache clients, highlighting both peak and average values, with peak throughput reaching up to 40 GiB/s. The bottom figure presents the IOPS of remove ops from garbage collection (GC) during the same time period.
![KVCache Read Throughput](./docs/images/kvcache_read_throughput.png)
![KVCache GC IOPS](./docs/images/kvcache_gc_iops.png)
## Check out source code
Clone 3FS repository from github:
git clone https://github.com/deepseek-ai/3fs
When `deepseek-ai/3fs` has been cloned to local file system, run the
following commands to check out the submodules:
```bash
cd 3fs
git submodule update --init --recursive
./patches/apply.sh
```
## Install dependencies
Install dependencies:
```bash
# for Ubuntu 20.04.
apt install cmake libuv1-dev liblz4-dev liblzma-dev libdouble-conversion-dev libprocps-dev libdwarf-dev libunwind-dev \
libaio-dev libgflags-dev libgoogle-glog-dev libgtest-dev libgmock-dev clang-format-14 clang-14 clang-tidy-14 lld-14 \
libgoogle-perftools-dev google-perftools libssl-dev ccache libclang-rt-14-dev gcc-10 g++-10 libboost1.71-all-dev
# for Ubuntu 22.04.
apt install cmake libuv1-dev liblz4-dev liblzma-dev libdouble-conversion-dev libprocps-dev libdwarf-dev libunwind-dev \
libaio-dev libgflags-dev libgoogle-glog-dev libgtest-dev libgmock-dev clang-format-14 clang-14 clang-tidy-14 lld-14 \
libgoogle-perftools-dev google-perftools libssl-dev ccache gcc-12 g++-12 libboost-all-dev
```
Install other build prerequisites:
- [`libfuse`](https://github.com/libfuse/libfuse/releases/tag/fuse-3.16.1) 3.16.1 or newer version
- [FoundationDB](https://apple.github.io/foundationdb/getting-started-linux.html) 7.1 or newer version
- [Rust](https://www.rust-lang.org/tools/install) toolchain
## Build 3FS
Build 3FS in `build` folder:
cmake -S . -B build -DCMAKE_CXX_COMPILER=clang++-14 -DCMAKE_C_COMPILER=clang-14 -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
cmake --build build -j 32
## Run a test cluster
Follow instructions in [setup guide](deploy/README.md) to run a test cluster.
## Report Issues
Please visit https://github.com/deepseek-ai/3fs/issues to report issues.

View File

@@ -0,0 +1 @@
add_subdirectory(storage_bench)

View File

@@ -0,0 +1 @@
target_add_bin(storage_bench "StorageBench.cc" test-fabric-lib storage-client storage memory-common follybenchmark gmock fdb mgmtd)

View File

@@ -0,0 +1,291 @@
#include "StorageBench.h"
#include <folly/init/Init.h>
#include "common/monitor/Monitor.h"
#include "memory/common/OverrideCppNewDelete.h"
DEFINE_bool(benchmarkNetwork, false, "Run in network benchmark mode");
DEFINE_bool(benchmarkStorage, false, "Run in storage benchmark mode");
DEFINE_bool(ignoreIOError, false, "Ignore all IO errors");
DEFINE_bool(injectRandomServerError, false, "Inject random server errors");
DEFINE_bool(injectRandomClientError, false, "Inject random client errors");
DEFINE_bool(retryPermanentError, false, "Retry requests with permanent errors");
DEFINE_bool(verifyReadData, false, "Check if the read data are correct");
DEFINE_bool(verifyReadChecksum, false, "Verify the checksum of read IOs");
DEFINE_bool(verifyWriteChecksum, true, "Verify the checksum of write IOs");
DEFINE_bool(randomShuffleChunkIds, false, "Random shuffle generated chunk IDs");
DEFINE_bool(generateTestData, true, "Generate test data for read test");
DEFINE_bool(sparseChunkIds, false, "Generate sparse chunk IDs");
DEFINE_bool(truncateChunks, false, "Truncate chunks");
DEFINE_bool(cleanupChunks, false, "Clean up (remove) chunks after benchmark");
DEFINE_bool(cleanupChunksBeforeBench, false, "Clean up (remove) chunks before benchmark");
DEFINE_bool(serverMode, false, "Run in server mode");
DEFINE_bool(clientMode, false, "Run in client mode");
DEFINE_bool(clusterMode, false, "Run in cluster mode (get routing info from mgmtd)");
DEFINE_bool(printMetrics, false, "Enable printing metrics in logs");
DEFINE_bool(reportMetrics, false, "Enable reporting metrics to ClickHouse");
DEFINE_uint32(metaStoreType, 0, "Metadata store type (0=LevelDB, 1=RocksDB, 2=MemDB)");
DEFINE_uint32(chunkSizeKB, 512, "Chunk size (KB)");
DEFINE_uint32(chainTableId, 0, "Chain table id");
DEFINE_uint32(chainTableVersion, 0, "Chain table version");
DEFINE_string(chainIds, "", "List of chain ids");
DEFINE_string(storageNodeIds, "", "List of storage node ids");
DEFINE_uint32(numChains, 1, "Number of chains");
DEFINE_uint32(numReplicas, 1, "Number of replicas");
DEFINE_uint32(numStorageNodes, 1, "Number of storage nodes");
DEFINE_uint32(numChunks, 1, "Number of chunks");
DEFINE_uint32(readSize, 4096, "Read IO size");
DEFINE_uint32(writeSize, 131072, "Write IO size");
DEFINE_uint32(memoryAlignment, 1, "Alignment size of each IO buffer");
DEFINE_uint32(readOffAlignment, 0, "Alignment size of each read IO offset");
DEFINE_uint32(batchSize, 1, "Read/write batch size");
DEFINE_uint32(readBatchSize, 0, "Read batch size");
DEFINE_uint32(writeBatchSize, 0, "Write batch size");
DEFINE_uint32(removeBatchSize, 0, "Remove batch size");
DEFINE_uint32(numReadSecs, 0, "Read test duration");
DEFINE_uint32(numWriteSecs, 0, "Write test duration");
DEFINE_uint32(numCoroutines, 1, "Number of coroutines");
DEFINE_uint32(numTestThreads, 1, "Number of test threads");
DEFINE_uint32(randSeed, 0, "Random seed for chunk id generation");
DEFINE_uint32(chunkIdPrefix, 0xFFFF, "The most significant 2 bytes of chunk ids");
DEFINE_uint32(serviceLevel, 0, "Service level");
DEFINE_uint32(listenPort, 0, "Listen port");
DEFINE_uint32(clientTimeoutMS, 30000, "Client timeout (milliseconds)");
DEFINE_string(dataPaths, folly::fs::temp_directory_path().string(), "Comma or space separated list of paths");
DEFINE_string(clientConfig, "", "Path of client config");
DEFINE_string(serverConfig, "", "Path of server config");
DEFINE_string(statsFilePath, "./perfstats.csv", "Path of performance stats file");
DEFINE_string(ibvDevices, "mlx5_0,mlx5_1", "Comma or space separated list of ibv devices");
DEFINE_string(ibnetZones, "", "Comma or space separated list of IB network zones");
DEFINE_string(clusterId, "stage", "Cluster id used to connect to mgmtd");
DEFINE_string(mgmtdEndpoints,
"",
"Comma or space separated list of mgmtd endpoints, "
"e.g. 'RDMA://10.1.1.1:1234,RDMA://10.1.1.2:1234'");
DEFINE_string(storageEndpoints,
"",
"Comma or space separated list of storage ids and endpoints, "
"e.g. '1@RDMA://10.1.1.1:1234,2@RDMA://10.1.1.2:1234'");
DEFINE_string(monitorEndpoint, "", "Monitor endpoint");
DEFINE_uint32(defaultPKeyIndex, 1, "IB default pkey index");
namespace hf3fs::storage::benchmark {
using namespace hf3fs::storage::client;
std::vector<uint32_t> stringToIntVec(const std::string &str) {
std::vector<uint32_t> vec;
std::vector<std::string> substrs;
boost::split(substrs, str, boost::is_any_of(", "));
for (auto s : substrs) {
boost::trim(s);
if (s.empty()) continue;
uint32_t n = std::stoul(s);
vec.push_back(n);
}
return vec;
}
bool runBenchmarks() {
std::vector<std::string> dataPathStrs;
boost::split(dataPathStrs, FLAGS_dataPaths, boost::is_any_of(", "));
std::vector<hf3fs::Path> dataPaths;
dataPaths.reserve(dataPathStrs.size());
for (auto str : dataPathStrs) {
boost::trim(str);
if (str.empty()) continue;
dataPaths.emplace_back(str);
}
std::vector<std::string> endpointRawStrs;
boost::split(endpointRawStrs, FLAGS_storageEndpoints, boost::is_any_of(", "));
std::map<NodeId, net::Address> storageEndpoints;
for (auto str : endpointRawStrs) {
boost::trim(str);
if (str.empty()) continue;
std::vector<std::string> nodeEndpointStrs;
boost::split(nodeEndpointStrs, str, boost::is_any_of("@"));
if (nodeEndpointStrs.size() != 2) {
XLOGF(ERR, "Wrong node endpoint string: {}", str);
return false;
}
auto nodeIdStr = nodeEndpointStrs[0];
auto endpointStr = nodeEndpointStrs[1];
NodeId nodeId{std::stoul(nodeIdStr)};
auto endpoint = net::Address::fromString(endpointStr);
storageEndpoints[nodeId] = endpoint;
XLOGF(WARN, "Add storage endpoint: {} @ {}", nodeId, endpoint);
}
if (FLAGS_clientMode && storageEndpoints.empty()) {
XLOGF(ERR, "No storage endpoint specified for client mode");
return false;
}
if (FLAGS_readSize > FLAGS_chunkSizeKB * 1024) {
XLOGF(ERR, "Read size {} is greater than chunk size {}", FLAGS_readSize, FLAGS_chunkSizeKB * 1024);
return false;
}
auto metaStoreType = static_cast<kv::KVStore::Type>(FLAGS_metaStoreType);
test::SystemSetupConfig setupConfig = {
FLAGS_chunkSizeKB * 1024 /*chunkSize*/,
FLAGS_numChains /*numChains*/,
FLAGS_numReplicas /*numReplicas*/,
FLAGS_numStorageNodes /*numStorageNodes*/,
dataPaths /*dataPaths*/,
FLAGS_clientConfig,
FLAGS_serverConfig,
storageEndpoints,
FLAGS_serviceLevel,
FLAGS_listenPort,
StorageClient::ImplementationType::RPC /*clientImplType*/,
metaStoreType,
true /*useFakeMgmtdClient*/,
!FLAGS_clientMode /*startStorageServer*/,
false,
};
std::vector<std::string> ibvDevices;
boost::split(ibvDevices, FLAGS_ibvDevices, boost::is_any_of(", "));
std::vector<std::string> ibnetZones;
boost::split(ibnetZones, FLAGS_ibnetZones, boost::is_any_of(", "));
endpointRawStrs.clear();
boost::split(endpointRawStrs, FLAGS_mgmtdEndpoints, boost::is_any_of(", "));
std::vector<net::Address> mgmtdEndpoints;
for (auto str : endpointRawStrs) {
boost::trim(str);
if (str.empty()) continue;
auto endpoint = net::Address::fromString(str);
mgmtdEndpoints.push_back(endpoint);
XLOGF(WARN, "Add mgmtd endpoint: {}", endpoint);
}
StorageBench::Options benchOptions{FLAGS_numChunks,
FLAGS_readSize,
FLAGS_writeSize,
FLAGS_batchSize,
FLAGS_numReadSecs,
FLAGS_numWriteSecs,
FLAGS_clientTimeoutMS,
FLAGS_numCoroutines,
FLAGS_numTestThreads,
FLAGS_randSeed,
(uint16_t)FLAGS_chunkIdPrefix,
FLAGS_benchmarkNetwork,
FLAGS_benchmarkStorage,
FLAGS_ignoreIOError,
FLAGS_injectRandomServerError,
FLAGS_injectRandomClientError,
FLAGS_retryPermanentError,
FLAGS_verifyReadData,
FLAGS_verifyReadChecksum,
FLAGS_verifyWriteChecksum,
FLAGS_randomShuffleChunkIds,
FLAGS_generateTestData,
FLAGS_sparseChunkIds,
FLAGS_statsFilePath,
ibvDevices,
ibnetZones,
mgmtdEndpoints,
FLAGS_clusterId,
FLAGS_chainTableId,
FLAGS_chainTableVersion,
stringToIntVec(FLAGS_chainIds),
stringToIntVec(FLAGS_storageNodeIds),
FLAGS_memoryAlignment,
FLAGS_readOffAlignment,
FLAGS_defaultPKeyIndex,
FLAGS_readBatchSize,
FLAGS_writeBatchSize,
FLAGS_removeBatchSize};
StorageBench bench(setupConfig, benchOptions);
if (FLAGS_clusterMode) {
if (!bench.connect()) {
XLOGF(WARN, "Failed to connect to cluster");
return false;
}
} else {
if (!bench.setup()) {
XLOGF(WARN, "Failed to set up benchmark");
return false;
}
}
bench.generateChunkIds();
if (FLAGS_cleanupChunksBeforeBench) {
bench.cleanup();
}
bool runOK = true;
if (FLAGS_serverMode) {
XLOGF(WARN, "Waiting...");
while (true) {
::sleep(1);
}
} else {
runOK = bench.run();
}
if (FLAGS_truncateChunks) {
bench.truncate();
}
if (FLAGS_cleanupChunks) {
bench.cleanup();
}
bench.teardown();
return runOK;
}
} // namespace hf3fs::storage::benchmark
int main(int argc, char **argv) {
folly::init(&argc, &argv, true);
hf3fs::monitor::Monitor::Config monitorConfig;
if (FLAGS_printMetrics || FLAGS_reportMetrics) {
if (FLAGS_printMetrics) {
monitorConfig.reporters(0).set_type("log");
} else if (FLAGS_reportMetrics) {
monitorConfig.reporters(0).set_type("monitor_collector");
monitorConfig.reporters(0).monitor_collector().set_remote_ip(FLAGS_monitorEndpoint);
monitorConfig.set_reporters_length(1);
}
auto monitorResult = hf3fs::monitor::Monitor::start(monitorConfig);
XLOGF_IF(FATAL, !monitorResult, "Failed to start monitor: {}", monitorResult.error());
}
bool ok = hf3fs::storage::benchmark::runBenchmarks();
hf3fs::monitor::Monitor::stop();
hf3fs::memory::shutdown();
return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}

View File

@@ -0,0 +1,895 @@
#pragma once
#include <boost/algorithm/string.hpp>
#include <boost/core/ignore_unused.hpp>
#include <common/utils/UtcTime.h>
#include <folly/experimental/coro/Collect.h>
#include <folly/futures/Barrier.h>
#include <folly/stats/TDigest.h>
#include <numeric>
#include <optional>
#include <random>
#include <vector>
#include "common/logging/LogInit.h"
#include "common/net/ib/IBDevice.h"
#include "common/utils/Duration.h"
#include "common/utils/SysResource.h"
#include "tests/lib/UnitTestFabric.h"
namespace hf3fs::storage::benchmark {
using namespace hf3fs::storage::client;
class StorageBench : public test::UnitTestFabric {
public:
// Benchmark configuration, populated from command-line flags. All fields
// are immutable except the three batch-size overrides at the bottom, which
// are resolved from `batchSize` in the StorageBench constructor when 0.
struct Options {
  const size_t numChunks;   // chunks generated per chain per coroutine
  const size_t readSize;    // bytes per read IO
  const size_t writeSize;   // bytes per write IO
  const size_t batchSize;   // default batch size for read/write/remove ops
  const uint64_t numReadSecs;    // read benchmark duration; 0 skips the read phase
  const uint64_t numWriteSecs;   // write benchmark duration; 0 skips the write phase
  const uint64_t clientTimeoutMS;  // storage client max retry time (ms)
  const size_t numCoroutines;    // number of concurrent worker coroutines
  const size_t numTestThreads;   // threads in the benchmark executor
  const uint32_t randSeed = 0;   // seed for chunk id generation (reproducible ids)
  const uint16_t chunkIdPrefix = 0xFFFF;  // top 16 bits of every generated chunk id
  const bool benchmarkNetwork = false;    // bypass disk IO (debug option) to measure the network
  const bool benchmarkStorage = false;    // bypass RDMA xmit (debug option) to measure storage
  const bool ignoreIOError = false;       // keep running on per-IO errors
  const bool injectRandomServerError = false;
  const bool injectRandomClientError = false;
  const bool retryPermanentError = false;
  const bool verifyReadData = false;      // compare read bytes against the write pattern
  const bool verifyReadChecksum = false;
  const bool verifyWriteChecksum = true;
  const bool randomShuffleChunkIds = true;   // shuffle each coroutine's chunk list
  const bool generateTestData = true;        // pre-create all chunks before reading
  const bool sparseChunkIds = true;          // random ids instead of sequential ones
  const std::string statsFilePath = "./perfstats.csv";  // CSV output; "" disables stats
  const std::vector<std::string> ibvDevices = {};       // IB device name filter
  const std::vector<std::string> ibnetZones = {};       // "zone:subnet" entries
  const std::vector<net::Address> mgmtdEndpoints = {};  // cluster mode: mgmtd servers
  const std::string clusterId = kClusterId;
  const uint32_t chainTableId = 0;
  const uint32_t chainTableVersion = 0;  // 0 selects the latest table version
  const std::vector<uint32_t> chainIds = {};        // restrict benchmark to these chains
  const std::vector<uint32_t> storageNodeIds = {};  // restrict to chains on these nodes
  const size_t memoryAlignment = 1;   // alignment of registered IO buffers
  const size_t readOffAlignment = 0;  // 0 = align read offsets to the read size
  const size_t defaultPKeyIndex = 1;
  // Per-op batch sizes; 0 means "use batchSize" (resolved in the ctor).
  size_t readBatchSize = 0;
  size_t writeBatchSize = 0;
  size_t removeBatchSize = 0;
};
 private:
  // Max number of centroids kept by each folly::TDigest latency sketch.
  static constexpr uint32_t kTDigestMaxSize = 1000;
  // Per-chunk bookkeeping: where the chunk lives and how many bytes have
  // been written to it so far (grows up to the configured chunk size).
  struct ChunkInfo {
    ChainId chainId;
    ChunkId chunkId;
    size_t size;
  };
  Options benchOptions_;
  std::vector<folly::TDigest> writeLatencyDigests_;  // one digest per coroutine
  std::vector<folly::TDigest> readLatencyDigests_;   // one digest per coroutine
  folly::CPUThreadPoolExecutor testExecutor_;
  std::atomic_uint64_t numWriteBytes_;  // total bytes written, summed across coroutines
  std::atomic_uint64_t numReadBytes_;   // total bytes read, summed across coroutines
  folly::Random::DefaultGenerator randGen_;
  std::vector<std::vector<ChunkInfo>> chunkInfos_;  // chunk list per coroutine
  std::vector<size_t> numCreatedChunks_;  // fully-written chunk count per coroutine
  size_t totalNumChunks_;   // chains * coroutines * numChunks
  double totalChunkGiB_;    // totalNumChunks_ * chunk_size, in GiB
public:
// Builds a benchmark harness on top of the unit-test fabric.
// Any of the per-op batch-size overrides left at 0 falls back to the
// common options.batchSize.
StorageBench(const test::SystemSetupConfig &setupConfig, const Options &options)
    : UnitTestFabric(setupConfig),
      benchOptions_(options),
      writeLatencyDigests_(benchOptions_.numCoroutines, folly::TDigest(kTDigestMaxSize)),
      readLatencyDigests_(benchOptions_.numCoroutines, folly::TDigest(kTDigestMaxSize)),
      testExecutor_(benchOptions_.numTestThreads),
      numWriteBytes_(0),
      numReadBytes_(0),
      randGen_(folly::Random::create()),
      chunkInfos_(benchOptions_.numCoroutines),
      numCreatedChunks_(benchOptions_.numCoroutines) {
  // Resolve the "0 = use the common batch size" defaults.
  if (benchOptions_.readBatchSize == 0) benchOptions_.readBatchSize = benchOptions_.batchSize;
  if (benchOptions_.writeBatchSize == 0) benchOptions_.writeBatchSize = benchOptions_.batchSize;
  if (benchOptions_.removeBatchSize == 0) benchOptions_.removeBatchSize = benchOptions_.batchSize;
}
// Pre-generates the per-coroutine lists of chunk ids used by the benchmark.
// Every id carries the configured 16-bit prefix in its top bits so benchmark
// chunks are easy to identify (and remove) later. Generation is seeded with
// benchOptions_.randSeed, so the same options reproduce the same ids.
void generateChunkIds() {
  static_assert(sizeof(benchOptions_.chunkIdPrefix) == 2);
  // Place the 16-bit prefix in the top 16 bits of a 64-bit id.
  uint64_t chunkIdPrefix64 = ((uint64_t)benchOptions_.chunkIdPrefix) << (UINT64_WIDTH - UINT16_WIDTH);
  std::sort(chainIds_.begin(), chainIds_.end());
  // Default-constructed mt19937 => deterministic shuffle order.
  static thread_local std::mt19937 generator;
  randGen_.seed(benchOptions_.randSeed);
  // Fix: print the full 64-bit prefix ({:016X}; {:08X} was too narrow).
  XLOGF(WARN,
        "Generating {} chunk ids with prefix {:016X} and random seed {}...",
        totalNumChunks_,
        chunkIdPrefix64,
        benchOptions_.randSeed);
  for (auto &chunkInfos : chunkInfos_) {
    // Mask the random bits to the low 48 bits so they cannot clobber the
    // 16-bit prefix. The previous unmasked OR only preserved the prefix for
    // the all-ones default 0xFFFF (for which this change is a no-op).
    uint64_t instancePrefix = chunkIdPrefix64 | (folly::Random::rand64(randGen_) & 0x0000FFFFFFFFFFFF);
    XLOGF(DBG3, "Random chunk id prefix {:016X}", instancePrefix);
    chunkInfos.reserve(chainIds_.size() * benchOptions_.numChunks);
    for (auto chainId : chainIds_) {
      for (size_t chunkIndex = 0; chunkIndex < benchOptions_.numChunks; chunkIndex++) {
        if (benchOptions_.sparseChunkIds) {
          // Sparse mode: random high/low halves (40 random bits below the
          // prefix in the high half; chunkIndex folded into the low half).
          uint64_t chunkIdHigh = chunkIdPrefix64 | (folly::Random::rand64(randGen_) & 0x000000FFFFFFFFFF);
          uint64_t chunkIdLow = (folly::Random::rand64(randGen_) << UINT32_WIDTH) + chunkIndex;
          chunkInfos.push_back({chainId, ChunkId(chunkIdHigh, chunkIdLow), 0});
        } else {
          // Dense mode: sequential ids under one per-coroutine random prefix.
          chunkInfos.push_back({chainId, ChunkId(instancePrefix, chunkIndex), 0});
        }
      }
    }
    if (benchOptions_.randomShuffleChunkIds) std::shuffle(chunkInfos.begin(), chunkInfos.end(), generator);
  }
}
// Connects the benchmark to an existing cluster (cluster mode): initializes
// IB networking, starts a mgmtd client, selects the replication chains to
// benchmark against, and finally creates the storage client.
// Returns false on any setup failure.
bool connect() {
  XLOGF(INFO, "Start to connect...");
  if (!setupIBSock()) {
    return false;
  }
  // Configure the mgmtd client: auto-refresh routing info and keep the
  // client session alive, but no heartbeat (this is a client, not a node).
  mgmtdClientConfig_.set_mgmtd_server_addresses(benchOptions_.mgmtdEndpoints);
  mgmtdClientConfig_.set_enable_auto_refresh(true);
  mgmtdClientConfig_.set_enable_auto_heartbeat(false);
  mgmtdClientConfig_.set_enable_auto_extend_client_session(true);
  mgmtdClientConfig_.set_auto_refresh_interval(3_s);
  mgmtdClientConfig_.set_auto_heartbeat_interval(3_s);
  mgmtdClientConfig_.set_auto_extend_client_session_interval(3_s);
  mgmtdClientConfig_.set_accept_incomplete_routing_info_during_mgmtd_bootstrapping(false);
  if (!client_.start()) {
    XLOGF(ERR, "Failed to start net client for mgmtd client");
    return false;
  }
  XLOGF(INFO, "Creating mgmtd client...");
  auto stubFactory = std::make_unique<hf3fs::stubs::RealStubFactory<hf3fs::mgmtd::MgmtdServiceStub>>(
      stubs::ClientContextCreator{[this](net::Address addr) { return client_.serdeCtx(addr); }});
  auto mgmtdClient = std::make_unique<hf3fs::client::MgmtdClientForClient>(benchOptions_.clusterId,
                                                                           std::move(stubFactory),
                                                                           mgmtdClientConfig_);
  // Identify this client by physical host and container host in the session.
  auto physicalHostnameRes = SysResource::hostname(/*physicalMachineName=*/true);
  if (!physicalHostnameRes) {
    XLOGF(ERR, "getHostname(true) failed: {}", physicalHostnameRes.error());
    return false;
  }
  auto containerHostnameRes = SysResource::hostname(/*physicalMachineName=*/false);
  if (!containerHostnameRes) {
    XLOGF(ERR, "getHostname(false) failed: {}", containerHostnameRes.error());
    return false;
  }
  mgmtdClient->setClientSessionPayload({clientId_.uuid.toHexString(),
                                        flat::NodeType::CLIENT,
                                        flat::ClientSessionData::create(
                                            /*universalId=*/*physicalHostnameRes,
                                            /*description=*/fmt::format("StorageBench: {}", *containerHostnameRes),
                                            /*serviceGroups=*/std::vector<flat::ServiceGroupInfo>{},
                                            flat::ReleaseVersion::fromVersionInfo()),
                                        flat::UserInfo{}});
  folly::coro::blockingWait(mgmtdClient->start(&client_.tpg().bgThreadPool().randomPick()));
  mgmtdForClient_ = std::move(mgmtdClient);
  // get routing info: poll mgmtd until the requested chain table shows up
  // (up to 15 attempts, one second apart).
  for (size_t retry = 0; retry < 15; retry++) {
    auto routingInfo = mgmtdForClient_->getRoutingInfo();
    if (routingInfo == nullptr || routingInfo->raw()->chains.empty()) {
      XLOGF(WARN, "Empty routing info, #{} retry...", retry + 1);
      std::this_thread::sleep_for(std::chrono::milliseconds(1000));
    } else {
      for (const auto &[tableId, tableVersions] : routingInfo->raw()->chainTables) {
        if (tableId == benchOptions_.chainTableId) {
          if (tableVersions.empty()) {
            XLOGF(WARN, "No version found for chain table with id {}", tableId);
            return false;
          }
          XLOGF(INFO, "Found {} version(s) of chain table {}", tableVersions.size(), benchOptions_.chainTableId);
          flat::ChainTable chainTable;
          if (benchOptions_.chainTableVersion > 0) {
            // A specific table version was requested; fail if it is missing.
            flat::ChainTableVersion tableVersion(benchOptions_.chainTableVersion);
            auto tableIter = tableVersions.find(tableVersion);
            if (tableIter == tableVersions.end()) {
              XLOGF(WARN, "Version {} not found in chain table with id {}", tableVersion, tableId);
              return false;
            }
            chainTable = tableIter->second;
            XLOGF(INFO,
                  "Found version {} of chain table {}: {}",
                  benchOptions_.chainTableVersion,
                  benchOptions_.chainTableId,
                  chainTable.chainTableVersion);
          } else {
            // No version specified: take the last (highest) version.
            const auto iter = --tableVersions.cend();
            const auto &latestTable = iter->second;
            chainTable = latestTable;
            XLOGF(INFO,
                  "Found latest version of chain table {}: {}",
                  benchOptions_.chainTableId,
                  chainTable.chainTableVersion);
          }
          XLOGF(WARN,
                "Selected chain table: {}@{} [{}] {} chains",
                chainTable.chainTableId,
                chainTable.chainTableVersion,
                chainTable.desc,
                chainTable.chains.size());
          if (!benchOptions_.storageNodeIds.empty()) {
            // Keep only chains with at least one target on the given nodes.
            for (const auto &chainId : chainTable.chains) {
              const auto chainInfo = routingInfo->raw()->getChain(chainId);
              for (const auto &target : chainInfo->targets) {
                const auto targetInfo = routingInfo->raw()->getTarget(target.targetId);
                auto nodeIter = std::find(benchOptions_.storageNodeIds.begin(),
                                          benchOptions_.storageNodeIds.end(),
                                          *targetInfo->nodeId);
                if (nodeIter != benchOptions_.storageNodeIds.end()) {
                  chainIds_.push_back(chainId);
                  break;
                }
              }
            }
          } else if (!benchOptions_.chainIds.empty()) {
            // Keep only the explicitly requested chain ids.
            for (const auto &chainId : chainTable.chains) {
              auto chainIter = std::find(benchOptions_.chainIds.begin(), benchOptions_.chainIds.end(), chainId);
              if (chainIter != benchOptions_.chainIds.end()) {
                chainIds_.push_back(chainId);
              }
            }
          } else {
            // No filter: benchmark every chain in the table.
            chainIds_ = chainTable.chains;
          }
          break;
        }
      }
      if (!chainIds_.empty()) break;
    }
  }
  if (chainIds_.empty()) {
    XLOGF(ERR, "Failed to get chain table with id {}", benchOptions_.chainTableId);
    return false;
  } else {
    XLOGF(WARN, "Selected {} replication chains for benchmark", chainIds_.size());
  }
  // create storage client from the config file referenced by the setup
  if (setupConfig_.client_config().empty()) {
    XLOGF(ERR, "Storage client config not specified");
    return false;
  }
  auto configRes = clientConfig_.atomicallyUpdate(setupConfig_.client_config(), false /*isHotUpdate*/);
  if (!configRes) {
    XLOGF(ERR, "Cannot load client config from {}, error: {}", setupConfig_.client_config(), configRes.error());
    return false;
  }
  totalNumChunks_ = chainIds_.size() * benchOptions_.numCoroutines * benchOptions_.numChunks;
  totalChunkGiB_ = (double)totalNumChunks_ * setupConfig_.chunk_size() / 1_GB;
  clientConfig_.retry().set_max_retry_time(Duration(std::chrono::milliseconds(benchOptions_.clientTimeoutMS)));
  clientConfig_.net_client().io_worker().ibsocket().set_sl(setupConfig_.service_level());
  XLOGF(INFO, "Creating storage client...");
  storageClient_ = client::StorageClient::create(clientId_, clientConfig_, *mgmtdForClient_);
  return true;
}
// Initializes the InfiniBand layer: parses the "zone:subnet" strings from
// benchOptions_.ibnetZones, applies the device filter and pkey index, and
// starts net::IBManager. Returns false on invalid input or IB failure.
bool setupIBSock() {
  XLOGF(WARN, "Setting up IB socket...");
  std::vector<net::IBConfig::Subnet> subnets;
  for (const auto &ibnetZoneStr : benchOptions_.ibnetZones) {
    // Each entry must look like "<zone>:<subnet>" (exactly one colon).
    std::vector<std::string> ibnetZoneSubnet;
    boost::split(ibnetZoneSubnet, ibnetZoneStr, boost::is_any_of(":"));
    if (ibnetZoneSubnet.size() != 2) {
      XLOGF(CRITICAL, "Invalid IB zone subnet: {}", ibnetZoneStr);
      return false;
    }
    auto zone = boost::trim_copy(ibnetZoneSubnet[0]);
    auto subnet = boost::trim_copy(ibnetZoneSubnet[1]);
    if (zone.empty() || subnet.empty()) {
      XLOGF(CRITICAL, "Invalid IB zone subnet: {}", ibnetZoneStr);
      return false;
    }
    subnets.emplace_back();
    subnets.back().set_network_zones({zone});
    subnets.back().set_subnet(*net::IBConfig::Network::from(subnet));
    XLOGF(WARN, "Add IB network zone: {} -- {}", zone, subnet);
  }
  net::IBConfig ibConfig;
  ibConfig.set_subnets(subnets);
  ibConfig.set_allow_unknown_zone(false);
  // NOTE(review): presumably expanded from the environment by the IB layer
  // -- confirm against net::IBConfig.
  ibConfig.set_default_network_zone("$HF3FS_NETWORK_ZONE");
  ibConfig.set_device_filter(benchOptions_.ibvDevices);
  ibConfig.set_default_pkey_index(benchOptions_.defaultPKeyIndex);
  auto ibResult = net::IBManager::start(ibConfig);
  if (ibResult.hasError()) {
    XLOGF(CRITICAL, "Cannot initialize IB device: {}", ibResult.error());
    return false;
  }
  return true;
}
// Stands up a local storage system (non-cluster mode) and derives the chunk
// totals and client retry budget from the options. Counterpart of connect().
bool setup() {
  XLOGF(WARN, "Setting up benchmark...");
  if (!setupIBSock()) {
    return false;
  }
  bool ok = setUpStorageSystem();
  totalNumChunks_ = chainIds_.size() * benchOptions_.numCoroutines * benchOptions_.numChunks;
  totalChunkGiB_ = (double)totalNumChunks_ * setupConfig_.chunk_size() / 1_GB;
  clientConfig_.retry().set_max_retry_time(Duration(std::chrono::milliseconds(benchOptions_.clientTimeoutMS)));
  return ok;
}
// Shuts down the storage system and the IB layer (reverse of setup()).
void teardown() {
  tearDownStorageSystem();
  net::IBManager::stop();
}
// Logs the average throughput (GiB/s) over the elapsed wall-clock time.
void printThroughput(hf3fs::SteadyClock::duration elapsedMicro, double totalGiB) {
  const auto millis = std::chrono::duration_cast<std::chrono::milliseconds>(elapsedMicro).count();
  const double elapsedSecs = millis / 1000.0;
  XLOGF(WARN, "Average throughput: {:.3f}GiB/s, total {:.3f} GiB", totalGiB / elapsedSecs, totalGiB);
}
// Logs a human-readable latency summary: sample count, min/max/avg, and a
// fixed set of quantiles, all in microseconds.
void printLatencyDigest(const folly::TDigest &digest) {
  XLOGF(WARN, "latency summary ({} samples)", digest.count());
  XLOGF(WARN, "min: {:10.1f}us", digest.min());
  XLOGF(WARN, "max: {:10.1f}us", digest.max());
  XLOGF(WARN, "avg: {:10.1f}us", digest.mean());
  static constexpr double kQuantiles[] = {0.1, 0.2, 0.5, 0.9, 0.95, 0.99};
  for (const double quantile : kQuantiles) {
    XLOGF(WARN, "{}%: {:10.1f}us", quantile * 100.0, digest.estimateQuantile(quantile));
  }
}
// Appends one CSV row of benchmark stats to benchOptions_.statsFilePath,
// writing the header line first when the file is missing or empty.
// No-op when the stats file path is empty.
//   testName    - row label, e.g. "batch write" / "batch read"
//   digest      - merged per-batch RPC latency digest (microseconds)
//   elapsedTime - wall-clock duration of the benchmark phase
//   totalGiB    - total bytes transferred, in GiB
//   readIO      - selects the read vs write IO/batch sizes for the row
void dumpPerfStats(const std::string &testName,
                   const folly::TDigest &digest,
                   hf3fs::SteadyClock::duration elapsedTime,
                   double totalGiB,
                   bool readIO) {
  if (benchOptions_.statsFilePath.empty()) return;
  boost::filesystem::path outFilePath(benchOptions_.statsFilePath);
  if (!boost::filesystem::exists(outFilePath) || boost::filesystem::is_empty(outFilePath)) {
    XLOGF(INFO, "Create a file for perfermance stats at {}", outFilePath);
    boost::filesystem::save_string_file(
        outFilePath,
        "test name,#storages,#chains,#replicas,concurrency,batch size,"
        "io size (bytes),effective batch size (batch size / #replicas),elapsed time (us),"
        "QPS,IOPS,bandwidth (MB/s),latency samples,min latency (us),max latency (us),avg latency (us),"
        "latency P50 (us),latency P75 (us),latency P90 (us),latency P95 (us),latency P99 (us)\n");
  }
  auto elapsedMicro = std::chrono::duration_cast<std::chrono::microseconds>(elapsedTime);
  double bandwidthMBps = totalGiB * 1024.0 / (elapsedMicro.count() / 1'000'000.0);
  size_t ioSize = readIO ? benchOptions_.readSize : benchOptions_.writeSize;
  size_t batchSize = readIO ? benchOptions_.readBatchSize : benchOptions_.writeBatchSize;
  // IOPS counts individual IOs per second; QPS counts whole batches.
  double iops = bandwidthMBps * 1024.0 * 1024.0 / ioSize;
  double qps = bandwidthMBps * 1024.0 * 1024.0 / (batchSize * ioSize);
  boost::filesystem::ofstream fout(outFilePath, std::ios_base::app);
  // NOTE(review): the CSV header labels the 8th column "batch size /
  // #replicas", but the value below divides by num_storage_nodes() --
  // confirm which is intended.
  fout << fmt::format("{},{},{},{},{},{},{},{:.1f},{},{:.1f},{:.1f},{:.3f},{},{:.1f},{:.1f},{:.1f}",
                      testName,
                      setupConfig_.num_storage_nodes(),
                      setupConfig_.num_chains(),
                      setupConfig_.num_replicas(),
                      benchOptions_.numCoroutines,
                      batchSize,
                      ioSize,
                      double(batchSize) / setupConfig_.num_storage_nodes(),
                      elapsedMicro.count(),
                      qps,
                      iops,
                      bandwidthMBps,
                      digest.count(),
                      digest.min(),
                      digest.max(),
                      digest.mean());
  for (double p : {0.5, 0.75, 0.9, 0.95, 0.99}) {
    fout << fmt::format(",{:.1f}", digest.estimateQuantile(p));
  }
  fout << "\n";
  fout.close();
}
// Write worker coroutine for one coroutine slot. Two modes, selected by
// numWriteSecs:
//   - numWriteSecs == 0: write until every chunk in chunkInfos_[instanceId]
//     is fully created (used by generateChunks()).
//   - numWriteSecs > 0: issue timed batches; fully-created chunks get
//     overwrites at random offsets.
// Records per-batch RPC latency into writeLatencyDigests_[instanceId] and
// adds the written byte count to numWriteBytes_.
// Returns StatusCode::kOK or the first error code encountered.
CoTask<uint32_t> batchWrite(uint32_t instanceId, size_t writeBatchSize, size_t writeSize, uint32_t numWriteSecs) {
  // create an aligned memory block large enough for one full chunk
  size_t memoryBlockSize = ALIGN_UPPER(setupConfig_.chunk_size(), benchOptions_.memoryAlignment);
  auto memoryBlock = (uint8_t *)folly::aligned_malloc(memoryBlockSize, sysconf(_SC_PAGESIZE));
  auto deleter = [](uint8_t *ptr) { folly::aligned_free(ptr); };
  std::unique_ptr<uint8_t, decltype(deleter)> memoryBlockPtr(memoryBlock, deleter);
  std::memset(memoryBlock, 0xFF, memoryBlockSize);
  if (benchOptions_.verifyReadData) {
    // Deterministic pattern (byte i == i mod 256) that batchRead verifies.
    for (size_t byteIndex = 0; byteIndex < memoryBlockSize; byteIndex++) {
      memoryBlock[byteIndex] = byteIndex;
    }
  }
  // register the memory block with the storage client
  auto regRes = storageClient_->registerIOBuffer(memoryBlock, memoryBlockSize);
  if (regRes.hasError()) {
    co_return regRes.error().code();
  }
  // create write IOs
  auto ioBuffer = std::move(*regRes);
  WriteOptions options;
  options.set_enableChecksum(benchOptions_.verifyWriteChecksum);
  options.debug().set_bypass_disk_io(benchOptions_.benchmarkNetwork);
  options.debug().set_bypass_rdma_xmit(benchOptions_.benchmarkStorage);
  options.debug().set_inject_random_server_error(benchOptions_.injectRandomServerError);
  options.debug().set_inject_random_client_error(benchOptions_.injectRandomClientError);
  options.retry().set_retry_permanent_error(benchOptions_.retryPermanentError);
  std::vector<double> elapsedMicroSecs;  // per-batch RPC latencies (us)
  uint64_t numWriteBytes = 0;
  std::vector<WriteIO> writeIOs;
  writeIOs.reserve(writeBatchSize);
  auto benchStart = hf3fs::SteadyClock::now();
  std::vector<ChunkInfo> &chunkInfos = chunkInfos_[instanceId];
  size_t &numCreatedChunks = numCreatedChunks_[instanceId];
  size_t seqChunkIndex = 0;
  while (true) {
    // Stop after the time budget (timed mode) or once all chunks exist.
    if (numWriteSecs) {
      auto accumElapsedSecs =
          std::chrono::duration_cast<std::chrono::seconds>(hf3fs::SteadyClock::now() - benchStart);
      if (accumElapsedSecs >= std::chrono::seconds(numWriteSecs)) break;
    } else {
      if (numCreatedChunks >= chunkInfos.size()) break;
    }
    writeIOs.clear();
    for (size_t writeIndex = 0; writeIndex < writeBatchSize; writeIndex++) {
      // Round-robin over the chunk list; chunkSize tracks bytes written.
      auto &[chainId, chunkId, chunkSize] = chunkInfos[seqChunkIndex++ % chunkInfos.size()];
      size_t writeOffset = 0;
      size_t writeLength = 0;
      if (chunkSize < setupConfig_.chunk_size()) {
        // Chunk not fully created yet: append at its current end.
        writeOffset = chunkSize;
        writeLength = std::min(writeSize, setupConfig_.chunk_size() - writeOffset);
        chunkSize += writeLength;
        numCreatedChunks += chunkSize == setupConfig_.chunk_size();
      } else {
        // Chunk fully created: overwrite at a random offset.
        writeOffset = folly::Random::rand32(0, setupConfig_.chunk_size() - writeSize);
        writeLength = writeSize;
      }
      auto writeIO = storageClient_->createWriteIO(chainId,
                                                   chunkId,
                                                   writeOffset,
                                                   writeLength,
                                                   setupConfig_.chunk_size(),
                                                   &memoryBlock[writeOffset],
                                                   &ioBuffer);
      writeIOs.push_back(std::move(writeIO));
      numWriteBytes += writeLength;
    }
    auto rpcStart = hf3fs::SteadyClock::now();
    co_await storageClient_->batchWrite(writeIOs, flat::UserInfo(), options);
    auto elapsedMicro = std::chrono::duration_cast<std::chrono::microseconds>(hf3fs::SteadyClock::now() - rpcStart);
    elapsedMicroSecs.push_back(elapsedMicro.count());
    if (!benchOptions_.ignoreIOError) {
      // Fail fast on the first bad result unless errors are ignored.
      for (const auto &writeIO : writeIOs) {
        if (writeIO.result.lengthInfo.hasError()) {
          XLOGF(ERR, "Error in write result: {}", writeIO.result);
          co_return writeIO.result.lengthInfo.error().code();
        }
        if (writeIO.length != *writeIO.result.lengthInfo) {
          XLOGF(ERR, "Unexpected write length: {} != {}", *writeIO.result.lengthInfo, writeIO.length);
          co_return StorageClientCode::kRemoteIOError;
        }
      }
    }
  }
  folly::TDigest digest;
  writeLatencyDigests_[instanceId] = digest.merge(elapsedMicroSecs);
  numWriteBytes_ += numWriteBytes;
  co_return StatusCode::kOK;
}
// Read worker coroutine for one coroutine slot: issues timed batches of
// random-chunk, random-offset reads for benchOptions_.numReadSecs seconds,
// optionally verifying lengths and data content. Records per-batch RPC
// latency into readLatencyDigests_[instanceId] and adds the read byte count
// to numReadBytes_.
// Returns StatusCode::kOK or the first error code encountered.
CoTask<uint32_t> batchRead(uint32_t instanceId) {
  // create an aligned memory block with one buffer slot per IO in the batch
  size_t alignedBufSize = ALIGN_UPPER(std::max(size_t(1), benchOptions_.readSize), benchOptions_.memoryAlignment);
  size_t memoryBlockSize = alignedBufSize * benchOptions_.readBatchSize;
  auto memoryBlock = (uint8_t *)folly::aligned_malloc(memoryBlockSize, sysconf(_SC_PAGESIZE));
  auto deleter = [](uint8_t *ptr) { folly::aligned_free(ptr); };
  std::unique_ptr<uint8_t, decltype(deleter)> memoryBlockPtr(memoryBlock, deleter);
  std::memset(memoryBlock, 0, memoryBlockSize);
  // register the memory block with the storage client
  auto regRes = storageClient_->registerIOBuffer(memoryBlock, memoryBlockSize);
  if (regRes.hasError()) {
    co_return regRes.error().code();
  }
  // Expected pattern (byte i == i mod 256) written by batchWrite when
  // verifyReadData is enabled.
  std::vector<uint8_t> expectedChunkData(setupConfig_.chunk_size());
  if (benchOptions_.verifyReadData) {
    for (size_t byteIndex = 0; byteIndex < expectedChunkData.size(); byteIndex++) {
      expectedChunkData[byteIndex] = byteIndex;
    }
  }
  // create read IOs
  auto ioBuffer = std::move(*regRes);
  ReadOptions options;
  options.set_enableChecksum(benchOptions_.verifyReadChecksum);
  options.debug().set_bypass_disk_io(benchOptions_.benchmarkNetwork);
  options.debug().set_bypass_rdma_xmit(benchOptions_.benchmarkStorage);
  options.debug().set_inject_random_server_error(benchOptions_.injectRandomServerError);
  options.debug().set_inject_random_client_error(benchOptions_.injectRandomClientError);
  options.retry().set_retry_permanent_error(benchOptions_.retryPermanentError);
  std::vector<double> elapsedMicroSecs;  // per-batch RPC latencies (us)
  uint64_t numReadBytes = 0;
  // Read offsets are aligned down to this boundary (defaults to read size).
  size_t offsetAlignment =
      benchOptions_.readOffAlignment ? benchOptions_.readOffAlignment : std::max(size_t(1), benchOptions_.readSize);
  std::vector<client::ReadIO> readIOs;
  readIOs.reserve(benchOptions_.readBatchSize);
  auto benchStart = hf3fs::SteadyClock::now();
  std::vector<ChunkInfo> &chunkInfos = chunkInfos_[instanceId];
  while (true) {
    // Run until the read time budget is exhausted.
    auto accumElapsedSecs = std::chrono::duration_cast<std::chrono::seconds>(hf3fs::SteadyClock::now() - benchStart);
    if (accumElapsedSecs >= std::chrono::seconds(benchOptions_.numReadSecs)) break;
    readIOs.clear();
    for (size_t readIndex = 0; readIndex < benchOptions_.readBatchSize; readIndex++) {
      // Pick a random chunk and a random aligned offset inside it.
      uint64_t randChunkIndex = folly::Random::rand64(0, chunkInfos.size());
      const auto &[chainId, chunkId, chunkSize] = chunkInfos[randChunkIndex];
      uint32_t offset = folly::Random::rand32(0, setupConfig_.chunk_size() - benchOptions_.readSize);
      uint32_t alignedOffset = ALIGN_LOWER(offset, offsetAlignment);
      auto readIO = storageClient_->createReadIO(chainId,
                                                 chunkId,
                                                 alignedOffset /*offset*/,
                                                 benchOptions_.readSize /*length*/,
                                                 &memoryBlock[readIndex * alignedBufSize],
                                                 &ioBuffer);
      readIOs.push_back(std::move(readIO));
      numReadBytes += benchOptions_.readSize;
    }
    auto rpcStart = hf3fs::SteadyClock::now();
    co_await storageClient_->batchRead(readIOs, flat::UserInfo(), options);
    auto elapsedMicro = std::chrono::duration_cast<std::chrono::microseconds>(hf3fs::SteadyClock::now() - rpcStart);
    elapsedMicroSecs.push_back(elapsedMicro.count());
    if (!benchOptions_.ignoreIOError) {
      // Fail fast on the first bad result unless errors are ignored.
      for (const auto &readIO : readIOs) {
        if (readIO.result.lengthInfo.hasError()) {
          XLOGF(ERR, "Error in read result: {}", readIO.result);
          co_return readIO.result.lengthInfo.error().code();
        }
        if (readIO.length != *readIO.result.lengthInfo) {
          XLOGF(ERR, "Unexpected read length: {} != {}", *readIO.result.lengthInfo, readIO.length);
          co_return StorageClientCode::kRemoteIOError;
        }
      }
    }
    if (benchOptions_.verifyReadData) {
      // Compare the returned bytes against the deterministic write pattern.
      for (const auto &readIO : readIOs) {
        auto diffPos = std::mismatch(&readIO.data[0], &readIO.data[readIO.length], &expectedChunkData[readIO.offset]);
        uint32_t byteIndex = diffPos.first - &readIO.data[0];
        if (byteIndex < readIO.length) {
          XLOGF(ERR,
                "Wrong data at bytes index {} and chunk offset {}: data {:#x} != expected {:#x}",
                byteIndex,
                readIO.offset + byteIndex,
                *diffPos.first,
                *diffPos.second);
          co_return StorageClientCode::kFoundBug;
        }
      }
    }
  }
  folly::TDigest digest;
  readLatencyDigests_[instanceId] = digest.merge(elapsedMicroSecs);
  numReadBytes_ += numReadBytes;
  co_return StatusCode::kOK;
}
// Fully populates every benchmark chunk by running batchWrite in "create"
// mode (numWriteSecs == 0) on all coroutines, then prints the aggregated
// throughput and latency. Returns StatusCode::kOK or the first failure.
uint32_t generateChunks() {
  XLOGF(WARN, "Generating {} test chunks ({:.3f} GiB)...", totalNumChunks_, totalChunkGiB_);
  auto testStart = hf3fs::SteadyClock::now();
  std::vector<folly::SemiFuture<uint32_t>> writeTasks;
  numWriteBytes_ = 0;
  // Use at least max_concurrent_requests / numCoroutines as the batch size
  // (presumably to saturate the client's write pipeline -- see
  // traffic_control config).
  size_t writeBatchSize =
      std::max(benchOptions_.writeBatchSize,
               clientConfig_.traffic_control().write().max_concurrent_requests() / benchOptions_.numCoroutines);
  for (size_t instanceId = 0; instanceId < benchOptions_.numCoroutines; instanceId++) {
    writeTasks.push_back(batchWrite(instanceId, writeBatchSize, setupConfig_.chunk_size(), 0 /*numWriteSecs*/)
                             .scheduleOn(folly::Executor::getKeepAliveToken(testExecutor_))
                             .start());
  }
  auto results = folly::coro::blockingWait(folly::coro::collectAllRange(std::move(writeTasks)));
  for (auto res : results) {
    if (res != StatusCode::kOK) {
      XLOGF(WARN, "Test task failed with status code: {}", res);
      return res;
    }
  }
  auto elapsedTime = hf3fs::SteadyClock::now() - testStart;
  double totalGiB = (double)numWriteBytes_ / 1_GB;
  printThroughput(elapsedTime, totalGiB);
  auto mergedDigest = folly::TDigest::merge(writeLatencyDigests_);
  printLatencyDigest(mergedDigest);
  return StatusCode::kOK;
}
// Runs the timed write benchmark on every coroutine, then reports the
// aggregated throughput/latency and appends a stats row to the CSV file.
// Returns StatusCode::kOK or the first failing task's status code.
uint32_t runWriteBench() {
  XLOGF(WARN,
        "Running write benchmark ({} secs, {} chunks, {:.3f} GiB)...",
        benchOptions_.numWriteSecs,
        totalNumChunks_,
        totalChunkGiB_);
  const auto startTime = hf3fs::SteadyClock::now();
  numWriteBytes_ = 0;
  std::vector<folly::SemiFuture<uint32_t>> tasks;
  tasks.reserve(benchOptions_.numCoroutines);
  for (size_t instanceId = 0; instanceId < benchOptions_.numCoroutines; instanceId++) {
    auto task =
        batchWrite(instanceId, benchOptions_.writeBatchSize, benchOptions_.writeSize, benchOptions_.numWriteSecs);
    tasks.push_back(std::move(task).scheduleOn(folly::Executor::getKeepAliveToken(testExecutor_)).start());
  }
  // Propagate the first non-OK status, if any.
  for (const auto status : folly::coro::blockingWait(folly::coro::collectAllRange(std::move(tasks)))) {
    if (status != StatusCode::kOK) {
      XLOGF(WARN, "Test task failed with status code: {}", status);
      return status;
    }
  }
  const auto elapsedTime = hf3fs::SteadyClock::now() - startTime;
  const double totalGiB = (double)numWriteBytes_ / 1_GB;
  printThroughput(elapsedTime, totalGiB);
  const auto mergedDigest = folly::TDigest::merge(writeLatencyDigests_);
  printLatencyDigest(mergedDigest);
  dumpPerfStats("batch write", mergedDigest, elapsedTime, totalGiB, false /*readIO*/);
  return StatusCode::kOK;
}
// Runs the timed read benchmark on every coroutine, then reports the
// aggregated throughput/latency and appends a stats row to the CSV file.
// Returns StatusCode::kOK or the first failing task's status code.
uint32_t runReadBench() {
  XLOGF(WARN, "Running read benchmark ({} secs)...", benchOptions_.numReadSecs);
  auto testStart = hf3fs::SteadyClock::now();
  std::vector<folly::SemiFuture<uint32_t>> readTasks;
  numReadBytes_ = 0;
  for (size_t instanceId = 0; instanceId < benchOptions_.numCoroutines; instanceId++) {
    readTasks.push_back(batchRead(instanceId).scheduleOn(folly::Executor::getKeepAliveToken(testExecutor_)).start());
  }
  auto results = folly::coro::blockingWait(folly::coro::collectAllRange(std::move(readTasks)));
  for (auto res : results) {
    if (res != StatusCode::kOK) {
      XLOGF(WARN, "Test task failed with status code: {}", res);
      return res;
    }
  }
  auto elapsedTime = hf3fs::SteadyClock::now() - testStart;
  double totalGiB = (double)numReadBytes_ / 1_GB;
  printThroughput(elapsedTime, totalGiB);
  auto mergedDigest = folly::TDigest::merge(readLatencyDigests_);
  printLatencyDigest(mergedDigest);
  // Fix: this is the read benchmark, so record the read-side IO size and
  // batch size in the stats CSV (was `false`, which recorded the write-side
  // parameters).
  dumpPerfStats("batch read", mergedDigest, elapsedTime, totalGiB, true /*readIO*/);
  return StatusCode::kOK;
}
// Removes all benchmark chunks. Each coroutine submits remove ops in batches
// of benchOptions_.removeBatchSize; the final partial batch is flushed as
// well. (Fix: previously any trailing ops smaller than a full batch were
// never submitted, leaving chunks behind.)
// Returns StatusCode::kOK or the first failing status code.
uint32_t cleanup() {
  XLOGF(WARN, "Clean up chunks...");
  std::vector<folly::SemiFuture<uint32_t>> removeTasks;
  for (size_t instanceId = 0; instanceId < benchOptions_.numCoroutines; instanceId++) {
    auto batchRemove = [this](size_t instanceId) -> folly::coro::Task<uint32_t> {
      std::vector<client::RemoveChunksOp> removeOps;
      size_t totalNumChunksRemoved = 0;
      const auto &chunkInfos = chunkInfos_[instanceId];
      for (size_t chunkIndex = 0; chunkIndex < chunkInfos.size(); chunkIndex++) {
        const auto &[chainId, chunkId, chunkSize] = chunkInfos[chunkIndex];
        removeOps.push_back(storageClient_->createRemoveOp(chainId, chunkId, ChunkId(chunkId, 1)));
        // Flush on a full batch or on the last chunk so no ops are dropped.
        const bool lastChunk = chunkIndex + 1 == chunkInfos.size();
        if (removeOps.size() >= benchOptions_.removeBatchSize || lastChunk) {
          WriteOptions options;
          options.debug().set_inject_random_server_error(benchOptions_.injectRandomServerError);
          options.debug().set_inject_random_client_error(benchOptions_.injectRandomClientError);
          options.retry().set_retry_permanent_error(benchOptions_.retryPermanentError);
          co_await storageClient_->removeChunks(removeOps, flat::UserInfo(), options);
          for (const auto &removeOp : removeOps) {
            if (removeOp.result.statusCode.hasError()) {
              XLOGF(WARN, "Remove operation failed with error: {}", removeOp.result.statusCode.error());
              co_return removeOp.result.statusCode.error().code();
            }
            XLOGF_IF(DBG5,
                     removeOp.result.numChunksRemoved != 1,
                     "{} chunks removed in range {}",
                     removeOp.result.numChunksRemoved,
                     removeOp.chunkRange());
            totalNumChunksRemoved += removeOp.result.numChunksRemoved;
          }
          removeOps.clear();
        }
      }
      XLOGF(WARN, "{} chunks removed by instance #{}", totalNumChunksRemoved, instanceId);
      co_return StatusCode::kOK;
    };
    removeTasks.push_back(
        batchRemove(instanceId).scheduleOn(folly::Executor::getKeepAliveToken(testExecutor_)).start());
  }
  auto results = folly::coro::blockingWait(folly::coro::collectAllRange(std::move(removeTasks)));
  for (auto res : results) {
    if (res != StatusCode::kOK) {
      XLOGF(WARN, "Test task failed with status code: {}", res);
      return res;
    }
  }
  return StatusCode::kOK;
}
// Truncates every benchmark chunk to the configured chunk size. Each
// coroutine submits truncate ops in batches of benchOptions_.writeBatchSize;
// the final partial batch is flushed as well. (Fix: previously any trailing
// ops smaller than a full batch were never submitted.)
// Returns StatusCode::kOK or the first failing status code.
uint32_t truncate() {
  XLOGF(WARN, "Truncate chunks...");
  std::vector<folly::SemiFuture<uint32_t>> truncateTasks;
  for (size_t instanceId = 0; instanceId < benchOptions_.numCoroutines; instanceId++) {
    auto batchTruncate = [this](size_t instanceId) -> folly::coro::Task<uint32_t> {
      std::vector<client::TruncateChunkOp> truncateOps;
      const auto &chunkInfos = chunkInfos_[instanceId];
      for (size_t chunkIndex = 0; chunkIndex < chunkInfos.size(); chunkIndex++) {
        const auto &[chainId, chunkId, chunkSize] = chunkInfos[chunkIndex];
        truncateOps.push_back(storageClient_->createTruncateOp(chainId, chunkId, 0, setupConfig_.chunk_size()));
        // Flush on a full batch or on the last chunk so no ops are dropped.
        const bool lastChunk = chunkIndex + 1 == chunkInfos.size();
        if (truncateOps.size() >= benchOptions_.writeBatchSize || lastChunk) {
          WriteOptions options;
          options.debug().set_inject_random_server_error(benchOptions_.injectRandomServerError);
          options.debug().set_inject_random_client_error(benchOptions_.injectRandomClientError);
          options.retry().set_retry_permanent_error(benchOptions_.retryPermanentError);
          co_await storageClient_->truncateChunks(truncateOps, flat::UserInfo(), options);
          for (const auto &truncateOp : truncateOps) {
            if (truncateOp.result.lengthInfo.hasError()) {
              XLOGF(WARN, "Truncate operation failed with error: {}", truncateOp.result.lengthInfo.error());
              co_return truncateOp.result.lengthInfo.error().code();
            }
          }
          truncateOps.clear();
        }
      }
      co_return StatusCode::kOK;
    };
    truncateTasks.push_back(
        batchTruncate(instanceId).scheduleOn(folly::Executor::getKeepAliveToken(testExecutor_)).start());
  }
  auto results = folly::coro::blockingWait(folly::coro::collectAllRange(std::move(truncateTasks)));
  for (auto res : results) {
    if (res != StatusCode::kOK) {
      XLOGF(WARN, "Test task failed with status code: {}", res);
      return res;
    }
  }
  return StatusCode::kOK;
}
// Executes the enabled benchmark phases in order: timed writes, chunk
// generation, timed reads. Returns false as soon as any phase fails.
bool run() {
  if (benchOptions_.numWriteSecs > 0 && runWriteBench() != StatusCode::kOK) return false;
  if (benchOptions_.generateTestData && generateChunks() != StatusCode::kOK) return false;
  if (benchOptions_.numReadSecs > 0 && runReadBench() != StatusCode::kOK) return false;
  return true;
}
uint64_t getWriteBytes() { return numWriteBytes_; }
uint64_t getReadBytes() { return numReadBytes_; }
};
} // namespace hf3fs::storage::benchmark

33
cmake/AddCrate.cmake Normal file
View File

@@ -0,0 +1,33 @@
# Select the cargo profile matching the CMake build type so Rust artifacts
# land in the expected target/<profile> directory.
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
  set(CARGO_CMD cargo build)
  set(TARGET_DIR "debug")
else ()
  set(CARGO_CMD cargo build --release)
  set(TARGET_DIR "release")
endif ()
# Build the whole cargo workspace once per CMake build.
add_custom_target(
  cargo_build_all ALL
  COMMAND ${CARGO_CMD}
  WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
)
# add_crate(NAME): wraps Rust crate NAME (built via cargo/cxxbridge) as a
# CMake static library target named NAME that C++ targets can link against.
# NOTE(review): being a macro, LIBRARY/SOURCES leak into the caller's scope
# and are overwritten by consecutive add_crate() calls -- confirm no caller
# reads them afterwards before converting to a function.
macro(add_crate NAME)
  set(LIBRARY "${PROJECT_SOURCE_DIR}/target/${TARGET_DIR}/lib${NAME}.a")
  # Generated cxx bridge sources produced by the cargo build.
  set(SOURCES
    "${PROJECT_SOURCE_DIR}/target/cxxbridge/${NAME}/src/cxx.rs.h"
    "${PROJECT_SOURCE_DIR}/target/cxxbridge/${NAME}/src/cxx.rs.cc"
  )
  add_custom_command(
    OUTPUT ${SOURCES} ${LIBRARY}
    COMMAND ${CARGO_CMD}
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/${NAME}"
  )
  add_library(${NAME} STATIC ${SOURCES} ${LIBRARY})
  target_link_libraries(${NAME} pthread dl ${LIBRARY})
  target_include_directories(${NAME} PUBLIC "${PROJECT_SOURCE_DIR}/target/cxxbridge")
  target_compile_options(${NAME} PUBLIC -Wno-dollar-in-identifier-extension)
  add_dependencies(${NAME} cargo_build_all)
endmacro()

54
cmake/ApacheArrow.cmake Normal file
View File

@@ -0,0 +1,54 @@
# Builds Apache Arrow (arrow + parquet, static, with bundled dependencies)
# via ExternalProject and exposes it as the single INTERFACE target
# `apache_arrow_static` for consumers to link against.
add_library(apache_arrow_static INTERFACE)
add_library(arrow_static STATIC IMPORTED)
add_library(parquet_static STATIC IMPORTED)
add_library(arrow_dependencies STATIC IMPORTED)
set(PREFIX "${CMAKE_CURRENT_BINARY_DIR}")
set(ARROW_RELEASE_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/src/apache-arrow-cpp/cpp/build/release")
# https://cmake.org/cmake/help/latest/policy/CMP0097.html
# Starting with CMake 3.16, explicitly setting GIT_SUBMODULES to an empty string
# means no submodules will be initialized or updated.
cmake_policy(SET CMP0097 NEW)
include(ExternalProject)
ExternalProject_Add(
  apache-arrow-cpp
  PREFIX ${PREFIX}
  GIT_REPOSITORY https://github.com/apache/arrow.git
  GIT_TAG b7d2f7ffca66c868bd2fce5b3749c6caa002a7f0
  GIT_SHALLOW ON
  GIT_PROGRESS ON
  GIT_SUBMODULES ""
  SOURCE_SUBDIR "cpp"
  BUILD_IN_SOURCE ON
  INSTALL_DIR ${PREFIX}
  # The configure step caches the dependency download script output in
  # export.sh so re-configures do not re-download.
  CONFIGURE_COMMAND bash -x -c "\
( cd thirdparty && [[ -f export.sh ]] || ./download_dependencies.sh | tee export.sh ) && \
source thirdparty/export.sh && cmake -S . -B . \
-DCMAKE_BUILD_TYPE=Release \
-DARROW_USE_CCACHE=OFF \
-DARROW_USE_SCCACHE=OFF \
-DARROW_DEPENDENCY_SOURCE=BUNDLED \
-DARROW_BUILD_STATIC=ON \
-DARROW_JEMALLOC=ON \
-DARROW_SIMD_LEVEL=DEFAULT \
-DARROW_BUILD_EXAMPLES=OFF \
-DARROW_PARQUET=ON -DARROW_CSV=ON \
-DARROW_WITH_ZSTD=ON -DARROW_WITH_LZ4=ON -DARROW_WITH_ZLIB=ON"
  BUILD_COMMAND bash -x -c "source thirdparty/export.sh && cmake --build . -j"
  INSTALL_COMMAND cmake --install . --prefix "${PREFIX}"
  # Declare the produced archives so Ninja can track them as byproducts.
  BUILD_BYPRODUCTS
    "${ARROW_RELEASE_BUILD_DIR}/libarrow.a"
    "${ARROW_RELEASE_BUILD_DIR}/libparquet.a"
    "${ARROW_RELEASE_BUILD_DIR}/libarrow_bundled_dependencies.a"
)
add_dependencies(arrow_static apache-arrow-cpp)
add_dependencies(parquet_static apache-arrow-cpp)
add_dependencies(arrow_dependencies apache-arrow-cpp)
set_target_properties(arrow_static PROPERTIES IMPORTED_LOCATION "${ARROW_RELEASE_BUILD_DIR}/libarrow.a")
set_target_properties(parquet_static PROPERTIES IMPORTED_LOCATION "${ARROW_RELEASE_BUILD_DIR}/libparquet.a")
set_target_properties(arrow_dependencies PROPERTIES IMPORTED_LOCATION "${ARROW_RELEASE_BUILD_DIR}/libarrow_bundled_dependencies.a")
target_include_directories(apache_arrow_static SYSTEM INTERFACE "${PREFIX}/include")
target_link_libraries(apache_arrow_static INTERFACE parquet_static arrow_static arrow_dependencies)

24
cmake/CLangFormat.cmake Normal file
View File

@@ -0,0 +1,24 @@
# Hard-coded path: the build environment is expected to ship clang-format-14.
set(CLANG_FORMAT "/usr/bin/clang-format-14")
if(EXISTS ${CLANG_FORMAT})
message(STATUS "Found clang-format at ${CLANG_FORMAT}")
set(SOURCE_DIRS
${CMAKE_SOURCE_DIR}/src
${CMAKE_SOURCE_DIR}/tests
${CMAKE_SOURCE_DIR}/benchmarks
)
# For now, it just hard codes the source files list to globs. That works
# fine until we have another directory in `src/`. We should ideally gather
# this from SOURCE_FILES list. But, should filter the third_party sources.
# Taking a quick route for now. We should deal with it sometime down the line.
# `format` rewrites files in place; `check-format` only reports violations
# (--dry-run --Werror) so it can gate CI. "_generated.h" headers are excluded.
add_custom_target(format
COMMENT "Running clang-format"
COMMAND find ${SOURCE_DIRS} -name '*.cc' -o -name '*.cpp' -o -name '*.h' | grep -v "_generated.h" | xargs ${CLANG_FORMAT} -i)
add_custom_target(check-format
COMMENT "Running clang-format"
COMMAND find ${SOURCE_DIRS} -name '*.cc' -o -name '*.cpp' -o -name '*.h' | grep -v "_generated.h" | xargs ${CLANG_FORMAT} --Werror --dry-run)
else()
message(FATAL_ERROR "clang-format-14 not found")
endif()

52
cmake/CLangTidy.cmake Normal file
View File

@@ -0,0 +1,52 @@
# clang-tidy generates too many warnings, so just disable it by default.
option(ENABLE_CLANG_TIDY "Run clang-tidy during build" OFF)
find_program(CLANG_TIDY NAMES clang-tidy-14)
if(CLANG_TIDY)
  # BUG FIX: the original tested `CMake_SOURCE_DIR STREQUAL CMake_BINARY_DIR`.
  # CMake variable names are case-sensitive, so both were undefined (empty ==
  # empty was avoided only because STREQUAL compared the literal names) and
  # the in-source-build guard could never fire for an actual in-source build.
  if(CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR)
    message(FATAL_ERROR "CMake_RUN_CLANG_TIDY requires an out-of-source build!")
  endif()
  # clang-tidy locates translation units through compile_commands.json.
  if(NOT CMAKE_EXPORT_COMPILE_COMMANDS)
    message(WARNING "CMAKE_EXPORT_COMPILE_COMMANDS=OFF, clang-tidy may not works!!!")
  endif()
  # Restrict diagnostics to first-party headers; the escaped \(..\|..\) form is
  # the regex alternation passed through to clang-tidy's -header-filter.
  set(HEADER_FILTER "${CMAKE_SOURCE_DIR}/\\(src\\|tests\\|benchmarks\\|demos\\)")
  if(ENABLE_CLANG_TIDY)
    set(CMAKE_CXX_CLANG_TIDY ${CLANG_TIDY} --header-filter ${HEADER_FILTER})
    # Create a preprocessor definition that depends on .clang-tidy content so
    # the compile command will change when .clang-tidy changes. This ensures
    # that a subsequent build re-runs clang-tidy on all sources even if they
    # do not otherwise need to be recompiled. Nothing actually uses this
    # definition. We add it to targets on which we run clang-tidy just to
    # get the build dependency on the .clang-tidy file.
    file(SHA1 ${CMAKE_CURRENT_SOURCE_DIR}/.clang-tidy clang_tidy_sha1)
    set(CLANG_TIDY_DEFINITIONS "CLANG_TIDY_SHA1=${clang_tidy_sha1}")
    unset(clang_tidy_sha1)
    configure_file(.clang-tidy .clang-tidy COPYONLY)
  endif()
  set(SOURCE_DIRS
    ${CMAKE_SOURCE_DIR}/src
    ${CMAKE_SOURCE_DIR}/tests
    ${CMAKE_SOURCE_DIR}/demos
    ${CMAKE_SOURCE_DIR}/benchmarks
  )
  # For now, it just hard codes the source files list to globs. That works
  # fine until we have another directory in `src/`. We should ideally gather
  # this from SOURCE_FILES list. But, should filter the third_party sources.
  # Taking a quick route for now. We should deal with it sometime down the line.
  # `clang-tidy` reports issues; `clang-tidy-fix` applies suggested fixes.
  # NOTE(review): because of find's operator precedence, `-not -name
  # "*.actor.cpp"` binds only to the `*.cpp` test — confirm that is intended.
  add_custom_target(clang-tidy
    COMMENT "Running clang-tidy"
    COMMAND run-clang-tidy-14 -header-filter ${HEADER_FILTER} `find ${SOURCE_DIRS} -name "*.cc" -o -name "*.cpp" -not -name "*.actor.cpp" ` -quiet)
  add_custom_target(clang-tidy-fix
    COMMENT "Running clang-tidy -fix"
    COMMAND run-clang-tidy-14 -header-filter ${HEADER_FILTER} `find ${SOURCE_DIRS} -name "*.cc" -o -name "*.cpp" -not -name "*.actor.cpp" ` -fix -quiet)
else()
  message(WARNING "clang-tidy-14 not found!!!")
endif()

12
cmake/CTestCustom.cmake Normal file
View File

@@ -0,0 +1,12 @@
# Post-process the generated CTestTestfile.cmake (this template is configured
# with @CMAKE_BINARY_DIR@ substituted at configure time) so that tests
# registered by third_party subprojects are not picked up by ctest.
set(_ctest_file "@CMAKE_BINARY_DIR@/CTestTestfile.cmake")
file(STRINGS "${_ctest_file}" _original_lines)
# Truncate the file, then re-emit every line, blanking the ones mentioning
# third_party (the blank line is kept so line structure is preserved).
file(WRITE "${_ctest_file}" "")
foreach(_line IN LISTS _original_lines)
  string(REGEX REPLACE ".*third_party.*" "" _kept_line "${_line}")
  file(APPEND "${_ctest_file}" "${_kept_line}\n")
endforeach()

27
cmake/CodeCoverage.cmake Normal file
View File

@@ -0,0 +1,27 @@
# Optional coverage instrumentation: gcov-style for GCC, source-based
# (llvm-cov) for Clang. Flags are appended globally to C and C++ builds.
option(ENABLE_CODE_COVERAGE "Enable code coverage" OFF)
if(ENABLE_CODE_COVERAGE)
  if(CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG" )
    message(STATUS "Enable code coverage with debug mode.")
  else()
    # Coverage of optimized builds still works but line data may be skewed.
    message(WARNING "Code coverage with no debug mode!!!")
  endif()
  # Pick the instrumentation flavor matching the compiler.
  if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
    set(COVERAGE_COMPILER_FLAGS "-g -fprofile-arcs -ftest-coverage")
    # gcov runtime is needed at link time for GCC's instrumentation.
    link_libraries(gcov)
  elseif(CMAKE_CXX_COMPILER_ID MATCHES "[Cc]lang")
    set(COVERAGE_COMPILER_FLAGS "-g -fprofile-instr-generate -fcoverage-mapping")
  else()
    # BUG FIX: message previously read "Unknown compiler iid" (typo).
    message(FATAL_ERROR "Unknown compiler id ${CMAKE_CXX_COMPILER_ID}")
  endif()
  include(CheckCXXCompilerFlag)
  # -fprofile-abs-path makes .gcno files use absolute paths, keeping reports
  # stable regardless of the compiler working directory (GCC >= 8 only).
  check_cxx_compiler_flag(-fprofile-abs-path HAVE_fprofile_abs_path)
  if(HAVE_fprofile_abs_path)
    set(COVERAGE_COMPILER_FLAGS "${COVERAGE_COMPILER_FLAGS} -fprofile-abs-path")
  endif()
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COVERAGE_COMPILER_FLAGS}")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COVERAGE_COMPILER_FLAGS}")
  message(STATUS "Appending code coverage compiler flags: ${COVERAGE_COMPILER_FLAGS}")
endif()

21
cmake/CompileFlags.cmake Normal file
View File

@@ -0,0 +1,21 @@
# Snapshot every per-language / per-configuration compiler-flag variable into
# ORIGINAL_* so restore_compile_flags() can undo temporary modifications.
macro(store_compile_flags)
  foreach(_flag_var
      C_FLAGS C_FLAGS_DEBUG C_FLAGS_RELEASE C_FLAGS_RELWITHDEBINFO
      CXX_FLAGS CXX_FLAGS_DEBUG CXX_FLAGS_RELEASE CXX_FLAGS_RELWITHDEBINFO)
    set(ORIGINAL_${_flag_var} "${CMAKE_${_flag_var}}")
  endforeach()
endmacro()
# Restore the CMAKE_{C,CXX}_FLAGS* variables from the ORIGINAL_* snapshots
# taken by store_compile_flags().
macro(restore_compile_flags)
  foreach(_flag_var
      C_FLAGS C_FLAGS_DEBUG C_FLAGS_RELEASE C_FLAGS_RELWITHDEBINFO
      CXX_FLAGS CXX_FLAGS_DEBUG CXX_FLAGS_RELEASE CXX_FLAGS_RELWITHDEBINFO)
    set(CMAKE_${_flag_var} "${ORIGINAL_${_flag_var}}")
  endforeach()
endmacro()

15
cmake/DumpConfig.cmake Normal file
View File

@@ -0,0 +1,15 @@
# `dump-config` regenerates the default TOML configs checked in under
# configs/ (in the SOURCE tree) by running each freshly-built binary with its
# --dump_default_* flag. The FUSE flavor additionally covers hf3fs_fuse_main.
# NOTE(review): admin_cli appears only in the plain-cfg dump — presumably it
# has no app/launcher config; confirm.
if (ENABLE_FUSE_APPLICATION)
add_custom_target(dump-config
COMMENT "Running dump config"
COMMAND find ${CMAKE_CURRENT_BINARY_DIR} -name 'meta_main' -o -name 'mgmtd_main' -o -name 'storage_main' -o -name 'admin_cli' -o -name 'hf3fs_fuse_main' | xargs -I {} bash -c '{} --dump_default_cfg > ${CMAKE_SOURCE_DIR}/configs/`basename {}`.toml'
COMMAND find ${CMAKE_CURRENT_BINARY_DIR} -name 'meta_main' -o -name 'mgmtd_main' -o -name 'storage_main' -o -name 'hf3fs_fuse_main' | xargs -I {} bash -c '{} --dump_default_app_cfg > ${CMAKE_SOURCE_DIR}/configs/`basename {}`_app.toml'
COMMAND find ${CMAKE_CURRENT_BINARY_DIR} -name 'meta_main' -o -name 'mgmtd_main' -o -name 'storage_main' -o -name 'hf3fs_fuse_main' | xargs -I {} bash -c '{} --dump_default_launcher_cfg > ${CMAKE_SOURCE_DIR}/configs/`basename {}`_launcher.toml')
add_dependencies(dump-config meta_main mgmtd_main storage_main admin_cli hf3fs_fuse_main)
else()
# Same as above, minus the FUSE binary.
add_custom_target(dump-config
COMMENT "Running dump config"
COMMAND find ${CMAKE_CURRENT_BINARY_DIR} -name 'meta_main' -o -name 'mgmtd_main' -o -name 'storage_main' -o -name 'admin_cli' | xargs -I {} bash -c '{} --dump_default_cfg > ${CMAKE_SOURCE_DIR}/configs/`basename {}`.toml'
COMMAND find ${CMAKE_CURRENT_BINARY_DIR} -name 'meta_main' -o -name 'mgmtd_main' -o -name 'storage_main' | xargs -I {} bash -c '{} --dump_default_app_cfg > ${CMAKE_SOURCE_DIR}/configs/`basename {}`_app.toml'
COMMAND find ${CMAKE_CURRENT_BINARY_DIR} -name 'meta_main' -o -name 'mgmtd_main' -o -name 'storage_main' | xargs -I {} bash -c '{} --dump_default_launcher_cfg > ${CMAKE_SOURCE_DIR}/configs/`basename {}`_launcher.toml')
add_dependencies(dump-config meta_main mgmtd_main storage_main admin_cli)
endif()

127
cmake/GitVersion.cmake Normal file
View File

@@ -0,0 +1,127 @@
find_package(Git REQUIRED)
# Where the VersionInfo.cc.in template lives and where the configured
# VersionInfo.cc is written. Both directories are overridable by the includer
# and by the script-mode (-P) re-invocation from CheckGitSetup.
if (NOT DEFINED PRE_CONFIGURE_DIR)
set(PRE_CONFIGURE_DIR ${PROJECT_SOURCE_DIR}/src/common/utils)
endif ()
if (NOT DEFINED POST_BUILD_DIR)
set(POST_BUILD_DIR ${PROJECT_BINARY_DIR})
endif ()
set(PRE_CONFIGURE_FILE ${PRE_CONFIGURE_DIR}/VersionInfo.cc.in)
set(POST_CONFIGURE_FILE ${POST_BUILD_DIR}/src/common/utils/VersionInfo.cc)
# Persist the commit hash the generated VersionInfo.cc was produced from, so
# later builds can skip regeneration when HEAD has not moved.
function(CheckGitWrite git_hash)
file(WRITE ${POST_BUILD_DIR}/git-state.txt ${git_hash})
endfunction()
# Read the cached commit hash into the caller's `git_hash` variable; leaves it
# untouched (typically undefined) when no cache file exists yet.
function(CheckGitRead git_hash)
if (EXISTS ${POST_BUILD_DIR}/git-state.txt)
file(STRINGS ${POST_BUILD_DIR}/git-state.txt CONTENT)
LIST(GET CONTENT 0 var)
set(${git_hash} ${var} PARENT_SCOPE)
endif ()
endfunction()
# Gather git metadata (short/full hash, commit timestamp/date, latest tag and
# commits-since-tag) plus project version numbers, then regenerate
# VersionInfo.cc from its template iff the commit hash changed.
function(CheckGitVersion)
# Get the latest abbreviated commit hash of the working branch
execute_process(
COMMAND ${GIT_EXECUTABLE} rev-parse --short=8 HEAD
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE BUILD_COMMIT_HASH_SHORT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
execute_process(
COMMAND ${GIT_EXECUTABLE} rev-parse HEAD
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE BUILD_COMMIT_HASH_FULL
OUTPUT_STRIP_TRAILING_WHITESPACE
)
# Commit time of HEAD as a unix epoch (%at).
execute_process(
COMMAND ${GIT_EXECUTABLE} log -1 --format=%at --date=local
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE BUILD_TIMESTAMP
OUTPUT_STRIP_TRAILING_WHITESPACE
)
# Render the epoch as YYYYMMDD (uses GNU `date -d @epoch`).
execute_process(
COMMAND date -d @${BUILD_TIMESTAMP} +%Y%m%d
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE BUILD_DATE
OUTPUT_STRIP_TRAILING_WHITESPACE
)
execute_process(
COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE BUILD_TAG
OUTPUT_STRIP_TRAILING_WHITESPACE
)
# Number of commits since that tag, extracted from `git describe --long`.
execute_process(
COMMAND bash -c "${GIT_EXECUTABLE} describe --tags --long | sed -E 's/(.*)-([0-9]+)-(\\w+)/\\2/g'"
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE BUILD_TAG_SEQ_NUM
OUTPUT_STRIP_TRAILING_WHITESPACE
)
# Fallbacks for repos without tags (e.g. shallow clones).
if(NOT DEFINED BUILD_TAG OR BUILD_TAG STREQUAL "")
set(BUILD_TAG "250228")
endif()
if(NOT DEFINED BUILD_TAG_SEQ_NUM OR BUILD_TAG_SEQ_NUM STREQUAL "")
set(BUILD_TAG_SEQ_NUM "1")
endif()
message(STATUS "Git Commit hash: ${BUILD_COMMIT_HASH_SHORT} ${BUILD_COMMIT_HASH_FULL}")
message(STATUS "Git Commit Date & Timestamp: ${BUILD_DATE} ${BUILD_TIMESTAMP}")
message(STATUS "Git Commit Tag & Seq Num: ${BUILD_TAG} ${BUILD_TAG_SEQ_NUM}")
set(BUILD_VERSION_MAJOR "${PROJECT_VERSION_MAJOR}")
set(BUILD_VERSION_MINOR "${PROJECT_VERSION_MINOR}")
set(BUILD_VERSION_PATCH "${PROJECT_VERSION_PATCH}")
set(BUILD_VERSION "${PROJECT_VERSION}")
CheckGitRead(GIT_HASH_CACHE)
if (NOT DEFINED GIT_HASH_CACHE)
set(GIT_HASH_CACHE "INVALID")
endif ()
if (NOT DEFINED BUILD_ON_RELEASE_BRANCH)
set(BUILD_ON_RELEASE_BRANCH "false")
endif ()
if (NOT DEFINED BUILD_PIPELINE_ID)
set(BUILD_PIPELINE_ID "999999")
endif()
# Only update the generated VersionInfo.cc if the hash has changed. This will
# prevent us from rebuilding the project more than we need to.
if (NOT ${BUILD_COMMIT_HASH_FULL} STREQUAL ${GIT_HASH_CACHE} OR NOT EXISTS ${POST_CONFIGURE_FILE})
# Set the GIT_HASH_CACHE so the next build won't have
# to regenerate the source file.
CheckGitWrite("${BUILD_COMMIT_HASH_FULL}")
configure_file(${PRE_CONFIGURE_FILE} ${POST_CONFIGURE_FILE} @ONLY)
endif ()
endfunction()
# Create the always-run AlwaysCheckGit target, which re-executes this file in
# script mode (-P) on every build so VersionInfo.cc tracks HEAD, and the
# `version-info` static library that other targets link for build metadata.
function(CheckGitSetup project_src_dir)
add_custom_target(AlwaysCheckGit COMMAND ${CMAKE_COMMAND}
-DRUN_CHECK_GIT_VERSION=1
-DPRE_CONFIGURE_DIR=${PRE_CONFIGURE_DIR}
-DPOST_BUILD_DIR=${POST_BUILD_DIR}
-DGIT_HASH_CACHE=${GIT_HASH_CACHE}
-DPROJECT_VERSION_MAJOR=${PROJECT_VERSION_MAJOR}
-DPROJECT_VERSION_MINOR=${PROJECT_VERSION_MINOR}
-DPROJECT_VERSION_PATCH=${PROJECT_VERSION_PATCH}
-DPROJECT_VERSION=${PROJECT_VERSION}
-DPROJECT_SOURCE_DIR=${project_src_dir}
-P ${PROJECT_SOURCE_DIR}/cmake/GitVersion.cmake
DEPENDS ${PRE_CONFIGURE_FILE}
BYPRODUCTS ${POST_CONFIGURE_FILE}
)
add_library(version-info STATIC ${POST_CONFIGURE_FILE})
target_include_directories(version-info PUBLIC ${PROJECT_SOURCE_DIR}/src)
add_dependencies(version-info AlwaysCheckGit)
# Also run once at configure time so the generated file exists before the
# first build.
CheckGitVersion()
endfunction()
# Script-mode entry point: AlwaysCheckGit invokes this file with `cmake -P`
# and -DRUN_CHECK_GIT_VERSION=1 on every build.
if (RUN_CHECK_GIT_VERSION)
CheckGitVersion()
endif ()

20
cmake/Jemalloc.cmake Normal file
View File

@@ -0,0 +1,20 @@
# `jemalloc` is the INTERFACE target consumers link; it wraps a shared
# jemalloc built from the vendored sources in third_party/jemalloc.
add_library(jemalloc INTERFACE)
add_library(hf3fs_jemalloc_shared SHARED IMPORTED)
include(ExternalProject)
set(JEMALLOC_DIR "${CMAKE_BINARY_DIR}/third_party/jemalloc")
# Configure flags: C++ API disabled, heap profiling enabled.
# NOTE(review): --disable-initial-exec-tls is presumably for dlopen()
# compatibility — confirm against how the library is loaded.
ExternalProject_add(Hf3fsJemalloc_project
SOURCE_DIR "${PROJECT_SOURCE_DIR}/third_party/jemalloc"
BUILD_BYPRODUCTS "${JEMALLOC_DIR}/include/jemalloc/jemalloc.h"
"${JEMALLOC_DIR}/lib/libjemalloc.so.2"
CONFIGURE_COMMAND ./autogen.sh && ./configure --prefix=${JEMALLOC_DIR} --disable-cxx --enable-prof --disable-initial-exec-tls
BUILD_IN_SOURCE ON
BUILD_COMMAND make -j 6
INSTALL_DIR "${JEMALLOC_DIR}"
INSTALL_COMMAND make install)
add_dependencies(hf3fs_jemalloc_shared Hf3fsJemalloc_project)
set_target_properties(hf3fs_jemalloc_shared PROPERTIES IMPORTED_LOCATION "${JEMALLOC_DIR}/lib/libjemalloc.so.2")
target_include_directories(hf3fs_jemalloc_shared INTERFACE "${JEMALLOC_DIR}/include")
target_link_libraries(jemalloc INTERFACE hf3fs_jemalloc_shared)

242
cmake/Sanitizers.cmake Normal file
View File

@@ -0,0 +1,242 @@
#
# Copyright (C) 2018-2022 by George Cave - gcave@stablecoder.ca
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
include(CheckCXXSourceCompiles)
# Map the project-facing SANITIZER cache option onto the USE_SANITIZER /
# USE_* variables consumed by the vendored selection logic below.
set(SANITIZER "" CACHE STRING "Sanitizer: ASAN, MSAN, UBSAN, TSAN")
if(SANITIZER STREQUAL "ASAN")
message(STATUS "Build with Address Sanitizer.")
set(USE_SANITIZER "Address")
set(USE_ASAN ON)
elseif(SANITIZER STREQUAL "MSAN")
message(STATUS "Build with Memory Sanitizer.")
set(USE_SANITIZER "Memory")
set(USE_MSAN ON)
elseif(SANITIZER STREQUAL "UBSAN")
message(STATUS "Build with Undefined behavior Sanitizer.")
message(WARNING "With clang++ v14.0.6 undefined behavior sanitizer doesn't work with coroutine!!!")
set(USE_SANITIZER "Undefined")
set(USE_UBSAN ON)
elseif(SANITIZER STREQUAL "TSAN")
message(STATUS "Build with Thread Sanitizer.")
set(USE_SANITIZER "Thread")
set(USE_TSAN ON)
else()
# Any other value (including empty) disables sanitizers entirely.
message(STATUS "Sanitizer not enabled. ${SANITIZER}")
set(USE_SANITIZER "")
endif()
# set(USE_SANITIZER "" CACHE STRING
# "Compile with a sanitizer. Options are: Address, Memory, MemoryWithOrigins, Undefined, Thread, Leak, 'Address;Undefined', CFI"
# )
# Space-join `value` onto every variable named in ARGN, exporting each result
# to the caller's scope.
function(append value)
  foreach(out_var IN LISTS ARGN)
    set(${out_var} "${${out_var}} ${value}" PARENT_SCOPE)
  endforeach()
endfunction()
# Like append(), but without quoting: `value` becomes an additional list
# element of each named variable instead of being folded into one
# space-separated string.
function(append_quoteless value)
  foreach(out_var IN LISTS ARGN)
    set(${out_var} ${${out_var}} ${value} PARENT_SCOPE)
  endforeach()
endfunction()
# Probe whether a trivial program compiles with `flags`; the result lands in
# the cache variable named by `return_var`. CMAKE_REQUIRED_* state is saved
# and restored around the probe.
function(test_san_flags return_var flags)
set(QUIET_BACKUP ${CMAKE_REQUIRED_QUIET})
set(CMAKE_REQUIRED_QUIET TRUE)
# Drop any cached result so the probe reruns with the current flags.
unset(${return_var} CACHE)
set(FLAGS_BACKUP ${CMAKE_REQUIRED_FLAGS})
set(CMAKE_REQUIRED_FLAGS "${flags}")
check_cxx_source_compiles("int main() { return 0; }" ${return_var})
set(CMAKE_REQUIRED_FLAGS "${FLAGS_BACKUP}")
set(CMAKE_REQUIRED_QUIET "${QUIET_BACKUP}")
endfunction()
# Probe each requested sanitizer flag with the active compiler, accumulate the
# working ones in SANITIZER_SELECTED_FLAGS, and finally append them to the
# global C/C++ flags.
if(USE_SANITIZER)
  # Keep frame pointers so sanitizer stack traces stay readable.
  append("-fno-omit-frame-pointer" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
  unset(SANITIZER_SELECTED_FLAGS)
  if(UNIX)
    # CONSISTENCY FIX: this project stores the upper-cased build type in
    # CMAKE_BUILD_TYPE_UC (see CodeCoverage.cmake / Target.cmake). The
    # original checked the undefined uppercase_CMAKE_BUILD_TYPE, so -O1 was
    # never appended for debug builds.
    if(CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG")
      append("-O1" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
    endif()
    if(USE_SANITIZER MATCHES "([Aa]ddress)")
      # Optional: -fno-optimize-sibling-calls -fsanitize-address-use-after-scope
      message(STATUS "Testing with Address sanitizer")
      set(SANITIZER_ADDR_FLAG "-fsanitize=address")
      test_san_flags(SANITIZER_ADDR_AVAILABLE ${SANITIZER_ADDR_FLAG})
      if(SANITIZER_ADDR_AVAILABLE)
        message(STATUS " Building with Address sanitizer")
        append("${SANITIZER_ADDR_FLAG}" SANITIZER_SELECTED_FLAGS)
        if(AFL)
          append_quoteless(AFL_USE_ASAN=1 CMAKE_C_COMPILER_LAUNCHER
                           CMAKE_CXX_COMPILER_LAUNCHER)
        endif()
      else()
        message(
          FATAL_ERROR
          "Address sanitizer not available for ${CMAKE_CXX_COMPILER}")
      endif()
    endif()
    if(USE_SANITIZER MATCHES "([Mm]emory([Ww]ith[Oo]rigins)?)")
      # Optional: -fno-optimize-sibling-calls -fsanitize-memory-track-origins=2
      set(SANITIZER_MEM_FLAG "-fsanitize=memory")
      if(USE_SANITIZER MATCHES "([Mm]emory[Ww]ith[Oo]rigins)")
        message(STATUS "Testing with MemoryWithOrigins sanitizer")
        append("-fsanitize-memory-track-origins" SANITIZER_MEM_FLAG)
      else()
        message(STATUS "Testing with Memory sanitizer")
      endif()
      test_san_flags(SANITIZER_MEM_AVAILABLE ${SANITIZER_MEM_FLAG})
      if(SANITIZER_MEM_AVAILABLE)
        if(USE_SANITIZER MATCHES "([Mm]emory[Ww]ith[Oo]rigins)")
          message(STATUS " Building with MemoryWithOrigins sanitizer")
        else()
          message(STATUS " Building with Memory sanitizer")
        endif()
        append("${SANITIZER_MEM_FLAG}" SANITIZER_SELECTED_FLAGS)
        if(AFL)
          append_quoteless(AFL_USE_MSAN=1 CMAKE_C_COMPILER_LAUNCHER
                           CMAKE_CXX_COMPILER_LAUNCHER)
        endif()
      else()
        message(
          FATAL_ERROR
          "Memory [With Origins] sanitizer not available for ${CMAKE_CXX_COMPILER}"
        )
      endif()
    endif()
    if(USE_SANITIZER MATCHES "([Uu]ndefined)")
      message(STATUS "Testing with Undefined Behaviour sanitizer")
      set(SANITIZER_UB_FLAG "-fsanitize=undefined")
      # Honor a suppression list when the project provides one.
      if(EXISTS "${BLACKLIST_FILE}")
        append("-fsanitize-blacklist=${BLACKLIST_FILE}" SANITIZER_UB_FLAG)
      endif()
      test_san_flags(SANITIZER_UB_AVAILABLE ${SANITIZER_UB_FLAG})
      if(SANITIZER_UB_AVAILABLE)
        message(STATUS " Building with Undefined Behaviour sanitizer")
        append("${SANITIZER_UB_FLAG}" SANITIZER_SELECTED_FLAGS)
        if(AFL)
          append_quoteless(AFL_USE_UBSAN=1 CMAKE_C_COMPILER_LAUNCHER
                           CMAKE_CXX_COMPILER_LAUNCHER)
        endif()
      else()
        message(
          FATAL_ERROR
          "Undefined Behaviour sanitizer not available for ${CMAKE_CXX_COMPILER}"
        )
      endif()
    endif()
    if(USE_SANITIZER MATCHES "([Tt]hread)")
      message(STATUS "Testing with Thread sanitizer")
      # TSan suppressions are read from tsan_ignore.txt at the source root.
      set(SANITIZER_THREAD_FLAG "-fsanitize=thread -fsanitize-ignorelist=${CMAKE_SOURCE_DIR}/tsan_ignore.txt")
      test_san_flags(SANITIZER_THREAD_AVAILABLE ${SANITIZER_THREAD_FLAG})
      if(SANITIZER_THREAD_AVAILABLE)
        message(STATUS " Building with Thread sanitizer")
        append("${SANITIZER_THREAD_FLAG}" SANITIZER_SELECTED_FLAGS)
        if(AFL)
          append_quoteless(AFL_USE_TSAN=1 CMAKE_C_COMPILER_LAUNCHER
                           CMAKE_CXX_COMPILER_LAUNCHER)
        endif()
      else()
        message(
          FATAL_ERROR "Thread sanitizer not available for ${CMAKE_CXX_COMPILER}"
        )
      endif()
    endif()
    if(USE_SANITIZER MATCHES "([Ll]eak)")
      message(STATUS "Testing with Leak sanitizer")
      set(SANITIZER_LEAK_FLAG "-fsanitize=leak")
      test_san_flags(SANITIZER_LEAK_AVAILABLE ${SANITIZER_LEAK_FLAG})
      if(SANITIZER_LEAK_AVAILABLE)
        message(STATUS " Building with Leak sanitizer")
        append("${SANITIZER_LEAK_FLAG}" SANITIZER_SELECTED_FLAGS)
        if(AFL)
          append_quoteless(AFL_USE_LSAN=1 CMAKE_C_COMPILER_LAUNCHER
                           CMAKE_CXX_COMPILER_LAUNCHER)
        endif()
      else()
        # BUG FIX: this error previously said "Thread sanitizer".
        message(
          FATAL_ERROR "Leak sanitizer not available for ${CMAKE_CXX_COMPILER}"
        )
      endif()
    endif()
    if(USE_SANITIZER MATCHES "([Cc][Ff][Ii])")
      message(STATUS "Testing with Control Flow Integrity(CFI) sanitizer")
      set(SANITIZER_CFI_FLAG "-fsanitize=cfi")
      test_san_flags(SANITIZER_CFI_AVAILABLE ${SANITIZER_CFI_FLAG})
      if(SANITIZER_CFI_AVAILABLE)
        message(STATUS " Building with Control Flow Integrity(CFI) sanitizer")
        # BUG FIX: the original appended SANITIZER_LEAK_FLAG here, so a CFI
        # build silently got the leak flag (or nothing) instead of CFI.
        append("${SANITIZER_CFI_FLAG}" SANITIZER_SELECTED_FLAGS)
        if(AFL)
          append_quoteless(AFL_USE_CFISAN=1 CMAKE_C_COMPILER_LAUNCHER
                           CMAKE_CXX_COMPILER_LAUNCHER)
        endif()
      else()
        message(
          FATAL_ERROR
          "Control Flow Integrity(CFI) sanitizer not available for ${CMAKE_CXX_COMPILER}"
        )
      endif()
    endif()
    # Final probe with every selected flag combined, since individually-valid
    # sanitizers are not always combinable.
    message(STATUS "Sanitizer flags: ${SANITIZER_SELECTED_FLAGS}")
    test_san_flags(SANITIZER_SELECTED_COMPATIBLE ${SANITIZER_SELECTED_FLAGS})
    if(SANITIZER_SELECTED_COMPATIBLE)
      message(STATUS " Building with ${SANITIZER_SELECTED_FLAGS}")
      append("${SANITIZER_SELECTED_FLAGS}" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
    else()
      message(
        FATAL_ERROR
        " Sanitizer flags ${SANITIZER_SELECTED_FLAGS} are not compatible.")
    endif()
  elseif(MSVC)
    # Only the Address sanitizer is handled in the MSVC branch.
    if(USE_SANITIZER MATCHES "([Aa]ddress)")
      message(STATUS "Building with Address sanitizer")
      append("-fsanitize=address" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
      if(AFL)
        append_quoteless(AFL_USE_ASAN=1 CMAKE_C_COMPILER_LAUNCHER
                         CMAKE_CXX_COMPILER_LAUNCHER)
      endif()
    else()
      message(
        FATAL_ERROR
        "This sanitizer not yet supported in the MSVC environment: ${USE_SANITIZER}"
      )
    endif()
  else()
    message(FATAL_ERROR "USE_SANITIZER is not supported on this platform.")
  endif()
endif()

109
cmake/Target.cmake Normal file
View File

@@ -0,0 +1,109 @@
# Check if IPO is supported
include(CheckIPOSupported)
check_ipo_supported(RESULT HAVE_IPO)
# Enable IPO in non-debug build
# (no-op when IPO is unsupported or the build type is DEBUG).
macro(target_enable_ipo NAME)
if(NOT CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG" AND HAVE_IPO)
set_property(TARGET ${NAME} PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
message (STATUS "Enabled IPO for target: ${NAME}")
endif()
endmacro()
# Declare a static library built from every .cc/.h under the calling
# directory (plus ${FBS_FILES}, if defined in the calling scope). Extra
# arguments are forwarded to target_link_libraries.
macro(target_add_lib NAME)
file(GLOB_RECURSE FILES CONFIGURE_DEPENDS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc" "*.h")
add_library(${NAME} STATIC ${FILES} ${FBS_FILES})
target_include_directories(${NAME}
PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/src>
${PROJECT_SOURCE_DIR}
${PROJECT_BINARY_DIR}/src
${PROJECT_BINARY_DIR}
)
# NOTE(review): trailing "" looks like a placeholder for empty ARGN — confirm.
target_link_libraries(${NAME} ${ARGN} "")
endmacro()
# Declare a shared library built from every .cc/.h under the calling
# directory (plus ${FBS_FILES}, if defined in the calling scope). Extra
# arguments are forwarded to target_link_libraries; IPO is enabled for
# non-debug builds.
macro(target_add_shared_lib NAME)
file(GLOB_RECURSE FILES CONFIGURE_DEPENDS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc" "*.h")
add_library(${NAME} SHARED ${FILES} ${FBS_FILES})
target_include_directories(${NAME}
PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/src>
${PROJECT_SOURCE_DIR}
${PROJECT_BINARY_DIR}/src
${PROJECT_BINARY_DIR}
)
# NOTE(review): trailing "" looks like a placeholder for empty ARGN — confirm.
target_link_libraries(${NAME} ${ARGN} "")
target_enable_ipo(${NAME})
endmacro()
# Declare an executable built from the single MAIN_FILE, linking the extra
# ARGN libraries, placed in <build>/bin. IPO is enabled for non-debug builds.
macro(target_add_bin NAME MAIN_FILE)
add_executable(${NAME} ${MAIN_FILE})
# NOTE(review): trailing "" looks like a placeholder for empty ARGN — confirm.
target_link_libraries(${NAME} ${ARGN} "")
target_include_directories(${NAME}
PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/src>
${PROJECT_SOURCE_DIR}
${PROJECT_SOURCE_DIR}/src/lib/api
${PROJECT_BINARY_DIR}/src
${PROJECT_BINARY_DIR}
)
set_target_properties(${NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
target_enable_ipo(${NAME})
endmacro()
# Declare a test executable built from every .cc under the calling directory,
# linked against gmock and the project's test_main, registered with CTest,
# and placed in <build>/tests.
macro(target_add_test NAME)
file(GLOB_RECURSE FILES CONFIGURE_DEPENDS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
add_executable(${NAME} ${FILES})
target_link_libraries(${NAME} gmock test_main ${ARGN} "")
target_include_directories(${NAME}
PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/src>
${PROJECT_SOURCE_DIR}
${PROJECT_BINARY_DIR}/src
${PROJECT_BINARY_DIR}
)
add_test(NAME ${NAME} COMMAND ${NAME})
set_target_properties(${NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests")
endmacro()
# Generate code from a flatbuffers schema and wrap the result in an INTERFACE
# target `NAME` carrying include paths and a dependency on the generated
# headers. Usage: target_add_fbs(name path/to/schema.fbs [SERVICE] [DEPS ...]).
macro(target_add_fbs NAME PATH)
cmake_parse_arguments(FBS "SERVICE" "" "DEPS" ${ARGN})
message("Fbs " ${NAME} " FBS_SERVICE " ${FBS_SERVICE} " ARGN " ${ARGN})
# Extra flatc arguments consumed by build_flatbuffers below.
# NOTE(review): --hf3fs is not a stock flatc option — presumably a flag of a
# patched flatc shipped with this project; confirm.
set(FLATBUFFERS_FLATC_SCHEMA_EXTRA_ARGS
--scoped-enums
--gen-object-api
--gen-mutable
--gen-compare
--cpp-std=c++17
--python
--hf3fs
--keep-prefix
)
get_filename_component(NAME_WE ${PATH} NAME_WE)
get_filename_component(DIR ${PATH} DIRECTORY)
# Registers a "<schema>-generated" custom target that runs flatc; include
# paths cover the schema's directory and the project src tree.
build_flatbuffers(${PATH}
"${CMAKE_CURRENT_SOURCE_DIR}/${DIR};${CMAKE_SOURCE_DIR}/src"
"${NAME_WE}-generated"
""
"${CMAKE_CURRENT_BINARY_DIR}/${DIR}"
""
""
)
add_library(${NAME} INTERFACE)
target_link_libraries(${NAME} INTERFACE common)
target_include_directories(${NAME}
INTERFACE
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/src>
${PROJECT_SOURCE_DIR}
${PROJECT_BINARY_DIR}/src
${PROJECT_BINARY_DIR}
)
add_dependencies(${NAME} "${NAME_WE}-generated" ${FBS_DEPS})
endmacro()

338
configs/admin_cli.toml Normal file
View File

@@ -0,0 +1,338 @@
break_multi_line_command_on_failure = false
cluster_id = ''
log = 'DBG:normal; normal=file:path=/var/log/3fs/cli.log,async=true,sync_level=ERR'
num_timeout_ms = 1000
profile = false
verbose = false
[client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[client.io_worker.transport_pool]
max_connections = 1
[client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[client.rdma_control]
max_concurrent_transmission = 64
[client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[fdb]
casual_read_risky = false
clusterFile = ''
default_backoff = 0
enableMultipleClient = false
externalClientDir = ''
externalClientPath = ''
multipleClientThreadNum = 4
readonly = false
trace_file = ''
trace_format = 'json'
[ib_devices]
allow_no_usable_devices = false
allow_unknown_zone = true
default_network_zone = 'UNKNOWN'
default_pkey_index = 0
default_roce_pkey_index = 0
default_traffic_class = 0
device_filter = []
fork_safe = true
prefer_ibdevice = true
skip_inactive_ports = true
skip_unusable_device = true
subnets = []
[meta_client]
check_server_interval = '5s'
dynamic_stripe = false
max_concurrent_requests = 128
network_type = 'RDMA'
remove_chunks_batch_size = 32
remove_chunks_max_iters = 1024
selection_mode = 'RandomFollow'
[meta_client.background_closer]
prune_session_batch_count = 128
prune_session_batch_interval = '10s'
retry_first_wait = '100ms'
retry_max_wait = '10s'
task_scan = '50ms'
[meta_client.background_closer.coroutine_pool]
coroutines_num = 8
enable_work_stealing = false
queue_size = 128
[meta_client.retry_default]
max_failures_before_failover = 1
retry_fast = '1s'
retry_init_wait = '500ms'
retry_max_wait = '5s'
retry_send = 1
retry_total_time = '1min'
rpc_timeout = '5s'
[mgmtd_client]
accept_incomplete_routing_info_during_mgmtd_bootstrapping = true
auto_extend_client_session_interval = '10s'
auto_heartbeat_interval = '10s'
auto_refresh_interval = '1s'
enable_auto_extend_client_session = false
enable_auto_heartbeat = false
enable_auto_refresh = true
mgmtd_server_addresses = []
work_queue_size = 100
[monitor]
collect_period = '1s'
num_collectors = 1
reporters = []
[storage_client]
check_overlapping_read_buffers = true
check_overlapping_write_buffers = false
chunk_checksum_type = 'CRC32C'
create_net_client_for_updates = false
implementation_type = 'RPC'
max_inline_read_bytes = '0'
max_inline_write_bytes = '0'
max_read_io_bytes = '0'
[storage_client.net_client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[storage_client.net_client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[storage_client.net_client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[storage_client.net_client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[storage_client.net_client.io_worker.transport_pool]
max_connections = 1
[storage_client.net_client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[storage_client.net_client.rdma_control]
max_concurrent_transmission = 64
[storage_client.net_client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[storage_client.net_client_for_updates]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[storage_client.net_client_for_updates.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[storage_client.net_client_for_updates.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[storage_client.net_client_for_updates.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[storage_client.net_client_for_updates.io_worker.transport_pool]
max_connections = 1
[storage_client.net_client_for_updates.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[storage_client.net_client_for_updates.rdma_control]
max_concurrent_transmission = 64
[storage_client.net_client_for_updates.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[storage_client.retry]
init_wait_time = '10s'
max_failures_before_failover = 1
max_retry_time = '1min'
max_wait_time = '30s'
[storage_client.traffic_control.query]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[storage_client.traffic_control.read]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[storage_client.traffic_control.remove]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[storage_client.traffic_control.truncate]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[storage_client.traffic_control.write]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[user_info]
gid = -1
gids = []
token = ''
uid = -1

View File

@@ -0,0 +1,557 @@
[[common.log.categories]]
categories = [ '.' ]
handlers = [ 'normal', 'err', 'fatal' ]
inherit = true
level = 'INFO'
propagate = 'NONE'
[[common.log.handlers]]
async = true
file_path = ''
max_file_size = '10MB'
max_files = 100
name = 'normal'
rotate = true
rotate_on_open = false
start_level = 'NONE'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = ''
max_file_size = '10MB'
max_files = 100
name = 'err'
rotate = true
rotate_on_open = false
start_level = 'ERR'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = ''
max_file_size = '10MB'
max_files = 100
name = 'fatal'
rotate = true
rotate_on_open = false
start_level = 'FATAL'
stream_type = 'STDERR'
writer_type = 'STREAM'
[common.memory]
prof_active = false
prof_prefix = ''
[common.monitor]
reporters = []
[server.agent]
allow_read_holes = true
auth_timeout = '5min'
background_ibreg = true
list_entry_limit = 100
max_concurrent_iovallocs = 0
mock_storage_dir = ''
mount_name = ''
read_only_mode = false
truncate_if_write_after_eof = false
[server.agent.inode_cache]
capacity = 4194304
entry_lifetime = '5min'
[server.agent.limit_per_process]
fd = 1048576
shm = 1048576
[server.agent.periodic_sync]
interval = '10min'
on = false
[server.agent.proc_watch]
interval = '1min'
on = true
[server.agent.storage_io.read]
enableChecksum = false
[server.agent.storage_io.read.debug]
bypass_disk_io = false
bypass_rdma_xmit = false
inject_random_client_error = false
inject_random_server_error = false
max_num_of_injection_points = 100
[server.agent.storage_io.read.retry]
init_wait_time = '0ns'
max_retry_time = '0ns'
max_wait_time = '0ns'
retry_permanent_error = false
[server.agent.storage_io.read.targetSelection]
mode = 'Default'
targetIndex = 0
trafficZone = ''
[server.agent.storage_io.write]
enableChecksum = true
[server.agent.storage_io.write.debug]
bypass_disk_io = false
bypass_rdma_xmit = false
inject_random_client_error = false
inject_random_server_error = false
max_num_of_injection_points = 100
[server.agent.storage_io.write.retry]
init_wait_time = '0ns'
max_retry_time = '0ns'
max_wait_time = '0ns'
retry_permanent_error = false
[server.agent.storage_io.write.targetSelection]
mode = 'Default'
targetIndex = 0
trafficZone = ''
[server.agent.storage_ops]
enable_read = true
enable_write = true
[server.background_client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[server.background_client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.background_client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.background_client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.background_client.io_worker.transport_pool]
max_connections = 1
[server.background_client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.background_client.rdma_control]
max_concurrent_transmission = 64
[server.background_client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.base.independent_thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.base.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 64
num_proc_threads = 64
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[[server.base.groups]]
check_connections_interval = '1min'
connection_expiration_time = '1day'
network_type = 'LOCAL'
services = [ 'ClientAgentSerde' ]
use_independent_thread_pool = false
[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.base.groups.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.base.groups.io_worker.transport_pool]
max_connections = 1
[server.base.groups.listener]
filter_list = []
listen_port = 0
listen_queue_depth = 4096
rdma_accept_timeout = '15s'
rdma_listen_ethernet = true
reuse_port = false
[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[[server.base.groups]]
check_connections_interval = '1min'
connection_expiration_time = '1day'
network_type = 'TCP'
services = [ 'Core' ]
use_independent_thread_pool = true
[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.base.groups.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.base.groups.io_worker.transport_pool]
max_connections = 1
[server.base.groups.listener]
filter_list = []
listen_port = 9000
listen_queue_depth = 4096
rdma_accept_timeout = '15s'
rdma_listen_ethernet = true
reuse_port = false
[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.meta]
check_server_interval = '5s'
dynamic_stripe = false
max_concurrent_requests = 128
network_type = 'RDMA'
remove_chunks_batch_size = 32
remove_chunks_max_iters = 1024
selection_mode = 'RandomFollow'
[server.meta.background_closer]
prune_session_batch_count = 128
prune_session_batch_interval = '10s'
retry_first_wait = '100ms'
retry_max_wait = '10s'
task_scan = '50ms'
[server.meta.background_closer.coroutine_pool]
coroutines_num = 8
enable_work_stealing = false
queue_size = 128
[server.meta.retry_default]
max_failures_before_failover = 1
retry_init_wait = '500ms'
retry_max_wait = '5s'
retry_send = 1
retry_total_time = '1min'
rpc_timeout = '2s'
[server.meta.retry_truncate]
max_failures_before_failover = 1
retry_init_wait = '2s'
retry_max_wait = '5s'
retry_send = 1
retry_total_time = '1min'
rpc_timeout = '15s'
[server.mgmtd]
accept_incomplete_routing_info_during_mgmtd_bootstrapping = true
auto_extend_client_session_interval = '10s'
auto_heartbeat_interval = '10s'
auto_refresh_interval = '10s'
enable_auto_extend_client_session = true
enable_auto_heartbeat = false
enable_auto_refresh = true
mgmtd_server_addresses = []
work_queue_size = 100
[server.storage]
check_overlapping_read_buffers = true
check_overlapping_write_buffers = false
chunk_checksum_type = 'CRC32C'
create_net_client_for_updates = false
implementation_type = 'RPC'
max_inline_read_bytes = '0'
max_inline_write_bytes = '0'
max_read_io_bytes = '0'
[server.storage.net_client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[server.storage.net_client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.storage.net_client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.storage.net_client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.storage.net_client.io_worker.transport_pool]
max_connections = 1
[server.storage.net_client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.storage.net_client.rdma_control]
max_concurrent_transmission = 64
[server.storage.net_client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.storage.net_client_for_updates]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[server.storage.net_client_for_updates.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.storage.net_client_for_updates.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.storage.net_client_for_updates.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.storage.net_client_for_updates.io_worker.transport_pool]
max_connections = 1
[server.storage.net_client_for_updates.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.storage.net_client_for_updates.rdma_control]
max_concurrent_transmission = 64
[server.storage.net_client_for_updates.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.storage.retry]
init_wait_time = '10s'
max_failures_before_failover = 1
max_retry_time = '1min'
max_wait_time = '30s'
[server.storage.traffic_control.query]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[server.storage.traffic_control.read]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[server.storage.traffic_control.remove]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[server.storage.traffic_control.truncate]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[server.storage.traffic_control.write]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,88 @@
allow_dev_version = true
cluster_id = ''
[client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[client.io_worker.transport_pool]
max_connections = 1
[client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[client.rdma_control]
max_concurrent_transmission = 64
[client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[ib_devices]
allow_unknown_zone = true
default_network_zone = 'UNKNOWN'
default_pkey_index = 0
default_traffic_class = 0
device_filter = []
fork_safe = true
skip_inactive_ports = true
subnets = []
[mgmtd_client]
accept_incomplete_routing_info_during_mgmtd_bootstrapping = true
auto_extend_client_session_interval = '10s'
auto_heartbeat_interval = '10s'
auto_refresh_interval = '10s'
enable_auto_extend_client_session = true
enable_auto_heartbeat = false
enable_auto_refresh = true
mgmtd_server_addresses = []
work_queue_size = 100

View File

@@ -0,0 +1,450 @@
attr_timeout = 30.0
batch_io_coros = 128
check_rmrf = true
chunk_size_limit = '0'
dryrun_bench_mode = false
enable_interrupt = false
enable_priority = false
enable_read_cache = true
enable_writeback_cache = false
entry_timeout = 30.0
fdatasync_update_length = false
flush_on_stat = true
fsync_length_hint = false
io_job_deq_timeout = '1ms'
io_jobq_size = 1024
iov_limit = '1MB'
max_background = 32
max_idle_threads = 10
max_jobs_per_ioring = 32
max_readahead = '16MB'
max_threads = 256
max_uid = '1M'
memset_before_read = false
negative_timeout = 5.0
notify_inval_threads = 32
rdma_buf_pool_size = 1024
readonly = false
submit_wait_jitter = '1ms'
symlink_timeout = 5.0
sync_on_stat = true
time_granularity = '1s'
[client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[client.io_worker.transport_pool]
max_connections = 1
[client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[client.rdma_control]
max_concurrent_transmission = 64
[client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[[common.log.categories]]
categories = [ '.' ]
handlers = [ 'normal', 'err', 'fatal' ]
inherit = true
level = 'INFO'
propagate = 'NONE'
[[common.log.handlers]]
async = true
file_path = '/var/log/3fs/hf3fs_fuse_main.log'
max_file_size = '100MB'
max_files = 10
name = 'normal'
rotate = true
rotate_on_open = false
start_level = 'NONE'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/hf3fs_fuse_main-err.log'
max_file_size = '100MB'
max_files = 10
name = 'err'
rotate = true
rotate_on_open = false
start_level = 'ERR'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/hf3fs_fuse_main-fatal.log'
max_file_size = '100MB'
max_files = 10
name = 'fatal'
rotate = true
rotate_on_open = false
start_level = 'FATAL'
stream_type = 'STDERR'
writer_type = 'STREAM'
[common.memory]
prof_active = false
prof_prefix = ''
[common.monitor]
collect_period = '1s'
num_collectors = 1
[[common.monitor.reporters]]
type = 'monitor_collector'
[common.monitor.reporters.monitor_collector]
remote_ip = ''
[io_bufs]
max_buf_size = '1MB'
max_readahead = '256KB'
write_buf_size = '1MB'
[io_jobq_sizes]
# NOTE(review): 'hi' (32) is much smaller than 'lo' (4096) — if these are high/low-priority
# queue depths the values look inverted; confirm intended sizing against the consumer code.
hi = 32
lo = 4096
[io_worker_coros]
hi = 8
lo = 8
[meta]
check_server_interval = '5s'
dynamic_stripe = true
max_concurrent_requests = 128
network_type = 'RDMA'
remove_chunks_batch_size = 32
remove_chunks_max_iters = 1024
selection_mode = 'RandomFollow'
[meta.background_closer]
prune_session_batch_count = 128
prune_session_batch_interval = '10s'
retry_first_wait = '100ms'
retry_max_wait = '10s'
task_scan = '50ms'
[meta.background_closer.coroutine_pool]
coroutines_num = 8
enable_work_stealing = false
queue_size = 128
[meta.retry_default]
max_failures_before_failover = 1
retry_fast = '1s'
retry_init_wait = '500ms'
retry_max_wait = '5s'
retry_send = 1
retry_total_time = '1min'
rpc_timeout = '5s'
[mgmtd]
accept_incomplete_routing_info_during_mgmtd_bootstrapping = true
auto_extend_client_session_interval = '10s'
auto_heartbeat_interval = '10s'
auto_refresh_interval = '10s'
enable_auto_extend_client_session = true
enable_auto_heartbeat = false
enable_auto_refresh = true
mgmtd_server_addresses = []
work_queue_size = 100
[periodic_sync]
enable = true
flush_write_buf = true
interval = '30s'
limit = 1000
[periodic_sync.worker]
coroutines_num = 4
enable_work_stealing = false
queue_size = 1024
[storage]
check_overlapping_read_buffers = true
check_overlapping_write_buffers = false
chunk_checksum_type = 'CRC32C'
create_net_client_for_updates = false
implementation_type = 'RPC'
max_inline_read_bytes = '0'
max_inline_write_bytes = '0'
max_read_io_bytes = '0'
[storage.net_client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[storage.net_client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[storage.net_client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[storage.net_client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[storage.net_client.io_worker.transport_pool]
max_connections = 1
[storage.net_client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[storage.net_client.rdma_control]
max_concurrent_transmission = 64
[storage.net_client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[storage.net_client_for_updates]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[storage.net_client_for_updates.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[storage.net_client_for_updates.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[storage.net_client_for_updates.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[storage.net_client_for_updates.io_worker.transport_pool]
max_connections = 1
[storage.net_client_for_updates.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[storage.net_client_for_updates.rdma_control]
max_concurrent_transmission = 64
[storage.net_client_for_updates.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[storage.retry]
init_wait_time = '10s'
max_failures_before_failover = 1
max_retry_time = '1min'
max_wait_time = '30s'
[storage.traffic_control.query]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[storage.traffic_control.read]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[storage.traffic_control.remove]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[storage.traffic_control.truncate]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[storage.traffic_control.write]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[storage_io.read]
allowReadUncommitted = false
enableChecksum = false
[storage_io.read.debug]
bypass_disk_io = false
bypass_rdma_xmit = false
inject_random_client_error = false
inject_random_server_error = false
max_num_of_injection_points = 100
[storage_io.read.retry]
init_wait_time = '0ns'
max_retry_time = '0ns'
max_wait_time = '0ns'
retry_permanent_error = false
[storage_io.read.targetSelection]
mode = 'Default'
targetIndex = 0
trafficZone = ''
[storage_io.write]
enableChecksum = true
[storage_io.write.debug]
bypass_disk_io = false
bypass_rdma_xmit = false
inject_random_client_error = false
inject_random_server_error = false
max_num_of_injection_points = 100
[storage_io.write.retry]
init_wait_time = '0ns'
max_retry_time = '0ns'
max_wait_time = '0ns'
retry_permanent_error = false
[storage_io.write.targetSelection]
mode = 'Default'
targetIndex = 0
trafficZone = ''

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,95 @@
allow_other = true
cluster_id = ''
mountpoint = ''
token_file = ''
[client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[client.io_worker.transport_pool]
max_connections = 1
[client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[client.rdma_control]
max_concurrent_transmission = 64
[client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[ib_devices]
allow_no_usable_devices = false
allow_unknown_zone = true
default_network_zone = 'UNKNOWN'
default_pkey_index = 0
default_roce_pkey_index = 0
default_traffic_class = 0
device_filter = []
fork_safe = true
prefer_ibdevice = true
skip_inactive_ports = true
skip_unusable_device = true
subnets = []
[mgmtd_client]
accept_incomplete_routing_info_during_mgmtd_bootstrapping = true
auto_extend_client_session_interval = '10s'
auto_heartbeat_interval = '10s'
auto_refresh_interval = '10s'
enable_auto_extend_client_session = true
enable_auto_heartbeat = false
enable_auto_refresh = true
mgmtd_server_addresses = []
work_queue_size = 100

633
configs/meta_main.toml Normal file
View File

@@ -0,0 +1,633 @@
[[common.log.categories]]
categories = [ '.' ]
handlers = [ 'normal', 'err', 'fatal' ]
inherit = true
level = 'INFO'
propagate = 'NONE'
[[common.log.categories]]
categories = [ 'eventlog' ]
handlers = [ 'event' ]
inherit = false
level = 'INFO'
propagate = 'ERR'
[[common.log.handlers]]
async = true
file_path = '/var/log/3fs/hf3fs_meta_main.log'
max_file_size = '100MB'
max_files = 10
name = 'normal'
rotate = true
rotate_on_open = false
start_level = 'NONE'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/hf3fs_meta_main-err.log'
max_file_size = '100MB'
max_files = 10
name = 'err'
rotate = true
rotate_on_open = false
start_level = 'ERR'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/hf3fs_meta_main-fatal.log'
max_file_size = '100MB'
max_files = 10
name = 'fatal'
rotate = true
rotate_on_open = false
start_level = 'FATAL'
stream_type = 'STDERR'
writer_type = 'STREAM'
[[common.log.handlers]]
async = true
file_path = '/var/log/3fs/hf3fs_meta_main-event.log'
max_file_size = '100MB'
max_files = 10
name = 'event'
rotate = true
rotate_on_open = false
start_level = 'INFO'
stream_type = 'STDERR'
writer_type = 'EVENT'
[common.memory]
prof_active = false
prof_prefix = ''
[common.monitor]
collect_period = '1s'
num_collectors = 1
[[common.monitor.reporters]]
type = 'monitor_collector'
[common.monitor.reporters.monitor_collector]
remote_ip = ''
[server]
use_memkv = false
[server.background_client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[server.background_client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.background_client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.background_client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.background_client.io_worker.transport_pool]
max_connections = 1
[server.background_client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.background_client.rdma_control]
max_concurrent_transmission = 64
[server.background_client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.base.independent_thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.base.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[[server.base.groups]]
check_connections_interval = '1min'
connection_expiration_time = '1day'
network_type = 'RDMA'
services = [ 'MetaSerde' ]
use_independent_thread_pool = false
[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.base.groups.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.base.groups.io_worker.transport_pool]
max_connections = 1
[server.base.groups.listener]
domain_socket_index = 1
filter_list = []
listen_port = 8001
listen_queue_depth = 4096
rdma_accept_timeout = '15s'
rdma_listen_ethernet = true
reuse_port = false
[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[[server.base.groups]]
check_connections_interval = '1min'
connection_expiration_time = '1day'
network_type = 'TCP'
services = [ 'Core' ]
use_independent_thread_pool = true
[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.base.groups.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.base.groups.io_worker.transport_pool]
max_connections = 1
[server.base.groups.listener]
domain_socket_index = 1
filter_list = []
listen_port = 9001
listen_queue_depth = 4096
rdma_accept_timeout = '15s'
rdma_listen_ethernet = true
reuse_port = false
[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.fdb]
casual_read_risky = false
clusterFile = ''
default_backoff = 0
enableMultipleClient = false
externalClientDir = ''
externalClientPath = ''
multipleClientThreadNum = 4
readonly = false
trace_file = ''
trace_format = 'json'
[server.kv_engine]
use_memkv = false
[server.kv_engine.fdb]
casual_read_risky = false
clusterFile = ''
default_backoff = 0
enableMultipleClient = false
externalClientDir = ''
externalClientPath = ''
multipleClientThreadNum = 4
readonly = false
trace_file = ''
trace_format = 'json'
[server.meta]
acl_cache_time = '15s'
allow_directly_move_to_trash = false
allow_owner_change_immutable = false
allow_stat_deleted_inodes = true
authenticate = false
batch_stat_by_path_concurrent = 4
batch_stat_concurrent = 8
check_file_hole = false
dynamic_stripe = false
dynamic_stripe_growth = 2
dynamic_stripe_initial = 16
enable_new_chunk_engine = false
grv_cache = false
idempotent_record_clean = '1min'
idempotent_record_expire = '30min'
idempotent_remove = true
idempotent_rename = false
iflags_chain_allocation = false
iflags_chunk_engine = true
ignore_length_hint = false
inodeId_abort_on_duplicate = false
inodeId_check_unique = true
list_default_limit = 128
max_batch_operations = 4096
max_directory_depth = 64
max_remove_chunks_per_request = 32
max_symlink_count = 10
max_symlink_depth = 4
operation_timeout = '5s'
otrunc_replace_file = true
otrunc_replace_file_threshold = '1GB'
readonly = false
recursive_remove_check_owner = true
recursive_remove_perm_check = 1024
statfs_cache_time = '1min'
statfs_space_imbalance_threshold = 5
statfs_update_interval = '5s'
sync_on_prune_session = false
time_granularity = '1s'
[server.meta.background_hole_checker]
coroutines_num = 16
enable_work_stealing = false
queue_size = 4096
[server.meta.distributor]
timeout = '30s'
update_interval = '1s'
[server.meta.event_trace_log]
dump_interval = '30s'
enabled = true
max_num_writers = 1
max_row_group_length = 100000
trace_file_dir = '.'
[server.meta.forward]
addr_type = 'RDMA'
debug = true
timeout = '10s'
[server.meta.gc]
check_session = true
distributed_gc = true
enable = true
gc_delay_free_space_threshold = 5
gc_directory_concurrent = 4
gc_directory_delay = '0ns'
gc_directory_entry_batch = 32
gc_directory_entry_concurrent = 4
gc_file_concurrent = 32
gc_file_delay = '5min'
large_file_chunks = 128
recursive_perm_check = true
remove_chunks_batch_size = 32
retry_delay = '10min'
scan_batch = 4096
scan_interval = '200ms'
small_file_chunks = 32
txn_low_priority = false
[server.meta.gc.retry_remove_chunks]
init_wait_time = '10s'
max_retry_time = '30s'
max_wait_time = '10s'
retry_permanent_error = false
[server.meta.gc.workers]
coroutines_num = 8
enable_work_stealing = false
queue_size = 1024
[server.meta.retry_remove_chunks]
init_wait_time = '10s'
max_retry_time = '30s'
max_wait_time = '10s'
retry_permanent_error = false
[server.meta.retry_transaction]
max_backoff = '1s'
max_retry_count = 10
[server.meta.session_manager]
enable = true
scan_batch = 1024
scan_interval = '5min'
session_timeout = '5min'
sync_on_prune_session = false
[server.meta.session_manager.close_workers]
coroutines_num = 32
enable_work_stealing = false
queue_size = 1024
[server.meta.session_manager.scan_workers]
coroutines_num = 8
enable_work_stealing = false
queue_size = 128
[server.meta.user_cache]
buckets = 127
exist_ttl = '5min'
inexist_ttl = '10s'
[server.mgmtd_client]
accept_incomplete_routing_info_during_mgmtd_bootstrapping = true
auto_extend_client_session_interval = '10s'
auto_heartbeat_interval = '10s'
auto_refresh_interval = '10s'
enable_auto_extend_client_session = false
enable_auto_heartbeat = true
enable_auto_refresh = true
mgmtd_server_addresses = []
work_queue_size = 100
[server.storage_client]
check_overlapping_read_buffers = true
check_overlapping_write_buffers = false
chunk_checksum_type = 'CRC32C'
create_net_client_for_updates = false
implementation_type = 'RPC'
max_inline_read_bytes = '0'
max_inline_write_bytes = '0'
max_read_io_bytes = '0'
[server.storage_client.net_client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[server.storage_client.net_client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.storage_client.net_client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.storage_client.net_client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.storage_client.net_client.io_worker.transport_pool]
max_connections = 1
[server.storage_client.net_client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.storage_client.net_client.rdma_control]
max_concurrent_transmission = 64
[server.storage_client.net_client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.storage_client.net_client_for_updates]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[server.storage_client.net_client_for_updates.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.storage_client.net_client_for_updates.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.storage_client.net_client_for_updates.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.storage_client.net_client_for_updates.io_worker.transport_pool]
max_connections = 1
[server.storage_client.net_client_for_updates.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.storage_client.net_client_for_updates.rdma_control]
max_concurrent_transmission = 64
[server.storage_client.net_client_for_updates.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.storage_client.retry]
init_wait_time = '2s'
max_failures_before_failover = 1
max_retry_time = '5s'
max_wait_time = '5s'
[server.storage_client.traffic_control.query]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[server.storage_client.traffic_control.read]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[server.storage_client.traffic_control.remove]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[server.storage_client.traffic_control.truncate]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true
[server.storage_client.traffic_control.write]
max_batch_bytes = '4MB'
max_batch_size = 128
max_concurrent_requests = 32
max_concurrent_requests_per_server = 8
process_batches_in_parallel = true
random_shuffle_requests = true

View File

@@ -0,0 +1,2 @@
allow_empty_node_id = true
node_id = 0

View File

@@ -0,0 +1,93 @@
allow_dev_version = true
cluster_id = ''
[client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[client.io_worker.transport_pool]
max_connections = 1
[client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[client.rdma_control]
max_concurrent_transmission = 64
[client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[ib_devices]
allow_no_usable_devices = false
allow_unknown_zone = true
default_network_zone = 'UNKNOWN'
default_pkey_index = 0
default_roce_pkey_index = 0
default_traffic_class = 0
device_filter = []
fork_safe = true
prefer_ibdevice = true
skip_inactive_ports = true
skip_unusable_device = true
subnets = []
[mgmtd_client]
accept_incomplete_routing_info_during_mgmtd_bootstrapping = true
auto_extend_client_session_interval = '10s'
auto_heartbeat_interval = '10s'
auto_refresh_interval = '10s'
enable_auto_extend_client_session = false
enable_auto_heartbeat = true
enable_auto_refresh = true
mgmtd_server_addresses = []
work_queue_size = 100

232
configs/mgmtd_main.toml Normal file
View File

@@ -0,0 +1,232 @@
[[common.log.categories]]
categories = [ '.' ]
handlers = [ 'normal', 'err', 'fatal' ]
inherit = true
level = 'INFO'
propagate = 'NONE'
[[common.log.handlers]]
async = true
file_path = '/var/log/3fs/mgmtd_main.log'
max_file_size = '100MB'
max_files = 10
name = 'normal'
rotate = true
rotate_on_open = false
start_level = 'NONE'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/mgmtd_main-err.log'
max_file_size = '100MB'
max_files = 10
name = 'err'
rotate = true
rotate_on_open = false
start_level = 'ERR'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/mgmtd_main-fatal.log'
max_file_size = '100MB'
max_files = 10
name = 'fatal'
rotate = true
rotate_on_open = false
start_level = 'FATAL'
stream_type = 'STDERR'
writer_type = 'STREAM'
[common.memory]
prof_active = false
prof_prefix = ''
[common.monitor]
collect_period = '1s'
num_collectors = 1
[[common.monitor.reporters]]
type = 'monitor_collector'
[common.monitor.reporters.monitor_collector]
remote_ip = ''
[server.base.independent_thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.base.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[[server.base.groups]]
check_connections_interval = '1min'
connection_expiration_time = '1day'
network_type = 'RDMA'
services = [ 'Mgmtd' ]
use_independent_thread_pool = false
[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.base.groups.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.base.groups.io_worker.transport_pool]
max_connections = 1
[server.base.groups.listener]
domain_socket_index = 1
filter_list = []
listen_port = 8000
listen_queue_depth = 4096
rdma_accept_timeout = '15s'
rdma_listen_ethernet = true
reuse_port = false
[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[[server.base.groups]]
check_connections_interval = '1min'
connection_expiration_time = '1day'
network_type = 'TCP'
services = [ 'Core' ]
use_independent_thread_pool = true
[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.base.groups.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.base.groups.io_worker.transport_pool]
max_connections = 1
[server.base.groups.listener]
domain_socket_index = 1
filter_list = []
listen_port = 9000
listen_queue_depth = 4096
rdma_accept_timeout = '15s'
rdma_listen_ethernet = true
reuse_port = false
[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.service]
allow_heartbeat_from_unregistered = true
authenticate = false
bootstrapping_length = '2min'
bump_routing_info_version_interval = '5s'
check_status_interval = '10s'
client_session_timeout = '20min'
enable_routinginfo_cache = true
extend_lease_check_release_version = true
extend_lease_interval = '10s'
heartbeat_fail_interval = '1min'
heartbeat_ignore_stale_targets = true
heartbeat_ignore_unknown_targets = false
heartbeat_timestamp_valid_window = '30s'
lease_length = '1min'
new_chain_bootstrap_interval = '2min'
only_accept_client_uuid = false
retry_times_on_txn_errors = -1
send_heartbeat = true
send_heartbeat_interval = '10s'
suspicious_lease_interval = '20s'
target_info_load_interval = '1s'
target_info_persist_batch = 1000
target_info_persist_interval = '1s'
try_adjust_target_order_as_preferred = false
update_chains_interval = '1s'
update_metrics_interval = '1s'
validate_lease_on_write = true
[server.service.retry_transaction]
max_backoff = '1s'
max_retry_count = 10
[server.service.user_cache]
buckets = 127
exist_ttl = '5min'
inexist_ttl = '10s'

View File

@@ -0,0 +1,2 @@
allow_empty_node_id = true
node_id = 0

View File

@@ -0,0 +1,44 @@
allow_dev_version = true
cluster_id = ''
use_memkv = false
[fdb]
casual_read_risky = false
clusterFile = ''
default_backoff = 0
enableMultipleClient = false
externalClientDir = ''
externalClientPath = ''
multipleClientThreadNum = 4
readonly = false
trace_file = ''
trace_format = 'json'
[ib_devices]
allow_no_usable_devices = false
allow_unknown_zone = true
default_network_zone = 'UNKNOWN'
default_pkey_index = 0
default_roce_pkey_index = 0
default_traffic_class = 0
device_filter = []
fork_safe = true
prefer_ibdevice = true
skip_inactive_ports = true
skip_unusable_device = true
subnets = []
[kv_engine]
use_memkv = false
[kv_engine.fdb]
casual_read_risky = false
clusterFile = ''
default_backoff = 0
enableMultipleClient = false
externalClientDir = ''
externalClientPath = ''
multipleClientThreadNum = 4
readonly = false
trace_file = ''
trace_format = 'json'

View File

@@ -0,0 +1,141 @@
[common]
cluster_id = ''
[common.ib_devices]
allow_unknown_zone = true
default_network_zone = 'UNKNOWN'
device_filter = []
subnets = []
[[common.log.categories]]
categories = [ '.' ]
handlers = [ 'normal', 'err', 'fatal' ]
inherit = true
level = 'INFO'
propagate = 'NONE'
[[common.log.handlers]]
async = true
file_path = '/var/log/3fs/monitor_collector_main.log'
max_file_size = '100MB'
max_files = 10
name = 'normal'
rotate = true
rotate_on_open = false
start_level = 'NONE'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/monitor_collector_main-err.log'
max_file_size = '100MB'
max_files = 10
name = 'err'
rotate = true
rotate_on_open = false
start_level = 'ERR'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/monitor_collector_main-fatal.log'
max_file_size = '100MB'
max_files = 10
name = 'fatal'
rotate = true
rotate_on_open = false
start_level = 'FATAL'
stream_type = 'STDERR'
writer_type = 'STREAM'
[server.base.independent_thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.base.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[[server.base.groups]]
#default_timeout = '1s'
#drop_connections_interval = '1h'
network_type = 'TCP'
services = [ 'MonitorCollector' ]
use_independent_thread_pool = false
[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drop_connections = 0
event_ack_batch = 128
#gid_index = 0
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
pkey_index = 0
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
traffic_class = 0
[server.base.groups.io_worker.transport_pool]
max_connections = 1
[server.base.groups.listener]
filter_list = []
listen_port = 10000
listen_queue_depth = 4096
rdma_listen_ethernet = true
reuse_port = false
[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
[server.monitor_collector]
batch_commit_size = 4096
conn_threads = 32
queue_capacity = 204800
[server.monitor_collector.reporter]
type = 'clickhouse'
[server.monitor_collector.reporter.clickhouse]
db = ''
host = ''
passwd = ''
port = ''
user = ''

509
configs/storage_main.toml Normal file
View File

@@ -0,0 +1,509 @@
[[common.log.categories]]
categories = [ '.' ]
handlers = [ 'normal', 'err', 'fatal' ]
inherit = true
level = 'INFO'
propagate = 'NONE'
[[common.log.handlers]]
async = true
file_path = '/var/log/3fs/storage_main.log'
max_file_size = '100MB'
max_files = 10
name = 'normal'
rotate = true
rotate_on_open = false
start_level = 'NONE'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/storage_main-err.log'
max_file_size = '100MB'
max_files = 10
name = 'err'
rotate = true
rotate_on_open = false
start_level = 'ERR'
stream_type = 'STDERR'
writer_type = 'FILE'
[[common.log.handlers]]
async = false
file_path = '/var/log/3fs/storage_main-fatal.log'
max_file_size = '100MB'
max_files = 10
name = 'fatal'
rotate = true
rotate_on_open = false
start_level = 'FATAL'
stream_type = 'STDERR'
writer_type = 'STREAM'
[common.memory]
prof_active = false
prof_prefix = ''
[common.monitor]
collect_period = '1s'
num_collectors = 1
[[common.monitor.reporters]]
type = 'monitor_collector'
[common.monitor.reporters.monitor_collector]
remote_ip = ''
[server]
speed_up_quit = true
use_coroutines_pool_read = true
use_coroutines_pool_update = true
[server.aio_read_worker]
enable_io_uring = true
inflight_control_offset = 128
ioengine = 'libaio'
max_events = 512
min_complete = 128
num_threads = 32
queue_size = 4096
wait_all_inflight = false
[server.allocate_worker]
max_remain_groups = 8
max_remain_ultra_groups = 4
max_reserved_chunks = '1GB'
min_remain_groups = 4
min_remain_ultra_groups = 0
[server.base.independent_thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.base.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 32
num_proc_threads = 32
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[[server.base.groups]]
check_connections_interval = '1min'
connection_expiration_time = '1day'
network_type = 'RDMA'
services = [ 'StorageSerde' ]
use_independent_thread_pool = false
[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.base.groups.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.base.groups.io_worker.transport_pool]
max_connections = 1
[server.base.groups.listener]
domain_socket_index = 1
filter_list = []
listen_port = 8000
listen_queue_depth = 4096
rdma_accept_timeout = '15s'
rdma_listen_ethernet = true
reuse_port = false
[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[[server.base.groups]]
check_connections_interval = '1min'
connection_expiration_time = '1day'
network_type = 'TCP'
services = [ 'Core' ]
use_independent_thread_pool = true
[server.base.groups.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.base.groups.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.base.groups.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.base.groups.io_worker.transport_pool]
max_connections = 1
[server.base.groups.listener]
domain_socket_index = 1
filter_list = []
listen_port = 9000
listen_queue_depth = 4096
rdma_accept_timeout = '15s'
rdma_listen_ethernet = true
reuse_port = false
[server.base.groups.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.buffer_pool]
big_rdmabuf_count = 64
big_rdmabuf_size = '64MB'
rdmabuf_count = 1024
rdmabuf_size = '4MB'
[server.check_worker]
disk_low_space_threshold = 0.95999999999999996
disk_reject_create_chunk_threshold = 0.97999999999999998
emergency_recycling_ratio = 0.94999999999999996
update_target_size_interval = '10s'
[server.client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[server.client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.client.io_worker.transport_pool]
max_connections = 1
[server.client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.client.rdma_control]
max_concurrent_transmission = 64
[server.client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.coroutines_pool_default]
coroutines_num = 64
queue_size = 1024
threads_num = 8
[server.coroutines_pool_read]
coroutines_num = 64
queue_size = 1024
threads_num = 8
[server.coroutines_pool_sync]
coroutines_num = 64
queue_size = 1024
threads_num = 8
[server.coroutines_pool_update]
coroutines_num = 64
queue_size = 1024
threads_num = 8
[server.dump_worker]
dump_interval = '1day'
dump_root_path = ''
high_cpu_usage_threshold = 100
[server.forward_client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[server.forward_client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[server.forward_client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[server.forward_client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[server.forward_client.io_worker.transport_pool]
max_connections = 1
[server.forward_client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[server.forward_client.rdma_control]
max_concurrent_transmission = 64
[server.forward_client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[server.mgmtd]
accept_incomplete_routing_info_during_mgmtd_bootstrapping = true
auto_extend_client_session_interval = '10s'
auto_heartbeat_interval = '10s'
auto_refresh_interval = '10s'
enable_auto_extend_client_session = false
enable_auto_heartbeat = true
enable_auto_refresh = true
mgmtd_server_addresses = []
work_queue_size = 100
[server.reliable_forwarding]
max_inline_forward_bytes = '0'
retry_first_wait = '100ms'
retry_max_wait = '1s'
retry_total_time = '1min'
[server.reliable_update]
clean_up_expired_clients = false
expired_clients_timeout = '1h'
[server.storage]
apply_transmission_before_getting_semaphore = true
batch_read_ignore_chain_version = false
batch_read_job_split_size = 1024
max_concurrent_rdma_reads = 256
max_concurrent_rdma_writes = 256
max_num_results_per_query = 100
post_buffer_per_bytes = '64KB'
rdma_transmission_req_timeout = '0ns'
read_only = false
[server.storage.event_trace_log]
dump_interval = '30s'
enabled = true
max_num_writers = 1
max_row_group_length = 100000
trace_file_dir = '.'
[server.storage.write_worker]
bg_num_threads = 8
num_threads = 32
queue_size = 4096
[server.sync_meta_kv_worker]
sync_meta_kv_interval = '1min'
[server.sync_worker]
batch_size = 16
full_sync_chains = []
full_sync_level = 'NONE'
num_channels = 1024
num_threads = 16
sync_start_timeout = '10s'
[server.sync_worker.batch_concurrency_limiter]
max_concurrency = 64
[server.sync_worker.pool]
coroutines_num = 64
enable_work_stealing = false
queue_size = 1024
[server.targets]
allow_disk_without_uuid = false
collect_all_fds = true
create_engine_path = true
space_info_cache_timeout = '5s'
target_num_per_path = 0
target_paths = []
[server.targets.storage_target]
force_persist = true
kv_path = ''
migrate_kv_store = false
mutex_num = 257
point_query_strategy = 'NONE'
[server.targets.storage_target.file_store]
preopen_chunk_size_list = []
[server.targets.storage_target.kv_store]
create_if_missing = false
integrate_leveldb_log = false
leveldb_block_cache_size = '8GB'
leveldb_iterator_fill_cache = true
leveldb_shared_block_cache = true
leveldb_sst_file_size = '16MB'
leveldb_write_buffer_size = '16MB'
rocksdb_avoid_flush_during_recovery = false
rocksdb_avoid_flush_during_shutdown = false
rocksdb_avoid_unnecessary_blocking_io = false
rocksdb_block_cache_size = '8GB'
rocksdb_block_size = '4KB'
rocksdb_bloom_filter_bits_per_key = 10
rocksdb_compression = 'kNoCompression'
rocksdb_enable_bloom_filter = true
rocksdb_enable_pipelined_write = false
rocksdb_enable_prefix_transform = true
rocksdb_keep_log_file_num = 10
rocksdb_level0_file_num_compaction_trigger = 4
rocksdb_lowest_used_cache_tier = 'kNonVolatileBlockTier'
rocksdb_max_manifest_file_size = '64MB'
rocksdb_num_levels = 7
rocksdb_prepopulate_block_cache = 'kDisable'
rocksdb_readahead_size = '2MB'
rocksdb_shared_block_cache = true
rocksdb_stats_dump_period = '2min'
rocksdb_target_file_size_base = '64MB'
rocksdb_target_file_size_multiplier = 1
rocksdb_threads_num = 8
rocksdb_unordered_write = false
rocksdb_wal_recovery_mode = 'kTolerateCorruptedTailRecords'
rocksdb_write_buffer_size = '16MB'
sync_when_write = true
type = 'LevelDB'
[server.targets.storage_target.meta_store]
allocate_size = '256MB'
punch_hole_batch_size = 16
recycle_batch_size = 256
removed_chunk_expiration_time = '3day'
removed_chunk_force_recycled_time = '1h'

View File

@@ -0,0 +1,2 @@
allow_empty_node_id = true
node_id = 0

View File

@@ -0,0 +1,93 @@
allow_dev_version = true
cluster_id = ''
[client]
default_compression_level = 0
default_compression_threshold = '128KB'
default_log_long_running_threshold = '0ns'
default_report_metrics = false
default_send_retry_times = 1
default_timeout = '1s'
enable_rdma_control = false
force_use_tcp = false
[client.io_worker]
num_event_loop = 1
rdma_connect_timeout = '5s'
read_write_rdma_in_event_thread = false
read_write_tcp_in_event_thread = false
tcp_connect_timeout = '1s'
wait_to_retry_send = '100ms'
[client.io_worker.connect_concurrency_limiter]
max_concurrency = 4
[client.io_worker.ibsocket]
buf_ack_batch = 8
buf_signal_batch = 8
buf_size = 16384
drain_timeout = '5s'
drop_connections = 0
event_ack_batch = 128
max_rd_atomic = 16
max_rdma_wr = 128
max_rdma_wr_per_post = 32
max_sge = 16
min_rnr_timer = 1
record_bytes_per_peer = false
record_latency_per_peer = false
retry_cnt = 7
rnr_retry = 0
send_buf_cnt = 32
sl = 0
start_psn = 0
timeout = 14
[client.io_worker.transport_pool]
max_connections = 1
[client.processor]
enable_coroutines_pool = true
max_coroutines_num = 256
max_processing_requests_num = 4096
response_compression_level = 1
response_compression_threshold = '128KB'
[client.rdma_control]
max_concurrent_transmission = 64
[client.thread_pool]
bg_thread_pool_stratetry = 'SHARED_QUEUE'
collect_stats = false
enable_work_stealing = false
io_thread_pool_stratetry = 'SHARED_QUEUE'
num_bg_threads = 2
num_connect_threads = 2
num_io_threads = 2
num_proc_threads = 2
proc_thread_pool_stratetry = 'SHARED_QUEUE'
[ib_devices]
allow_no_usable_devices = false
allow_unknown_zone = true
default_network_zone = 'UNKNOWN'
default_pkey_index = 0
default_roce_pkey_index = 0
default_traffic_class = 0
device_filter = []
fork_safe = true
prefer_ibdevice = true
skip_inactive_ports = true
skip_unusable_device = true
subnets = []
[mgmtd_client]
accept_incomplete_routing_info_during_mgmtd_bootstrapping = true
auto_extend_client_session_interval = '10s'
auto_heartbeat_interval = '10s'
auto_refresh_interval = '10s'
enable_auto_extend_client_session = false
enable_auto_heartbeat = true
enable_auto_refresh = true
mgmtd_server_addresses = []
work_queue_size = 100

373
deploy/README.md Normal file
View File

@@ -0,0 +1,373 @@
# 3FS Setup Guide
This section provides a manual deployment guide for setting up a six-node cluster with the cluster ID `stage`.
## Installation prerequisites
### Hardware specifications
| Node | OS | IP | Memory | SSD | RDMA |
|----------|---------------|--------------|--------|------------|-------|
| meta | Ubuntu 22.04 | 192.168.1.1 | 128GB | - | RoCE |
| storage1 | Ubuntu 22.04 | 192.168.1.2 | 512GB | 14TB × 16 | RoCE |
| storage2 | Ubuntu 22.04 | 192.168.1.3 | 512GB | 14TB × 16 | RoCE |
| storage3 | Ubuntu 22.04 | 192.168.1.4 | 512GB | 14TB × 16 | RoCE |
| storage4 | Ubuntu 22.04 | 192.168.1.5 | 512GB | 14TB × 16 | RoCE |
| storage5 | Ubuntu 22.04 | 192.168.1.6 | 512GB | 14TB × 16 | RoCE |
> **RDMA Configuration**
> 1. Assign IP addresses to RDMA NICs. Multiple RDMA NICs (InfiniBand or RoCE) are supported on each node.
> 2. Check RDMA connectivity between nodes using `ib_write_bw`.
### Third-party dependencies
In a production environment, it is recommended to install FoundationDB and ClickHouse on dedicated nodes.
| Service | Node |
|------------|-------------------------|
| [ClickHouse](https://clickhouse.com/docs/install) | meta |
| [FoundationDB](https://apple.github.io/foundationdb/administration.html) | meta |
> **FoundationDB**
> 1. Ensure that the version of FoundationDB client matches the server version, or copy the corresponding version of `libfdb_c.so` to maintain compatibility.
> 2. Find the `fdb.cluster` file and `libfdb_c.so` at `/etc/foundationdb/fdb.cluster`, `/usr/lib/libfdb_c.so` on nodes with FoundationDB installed.
---
## Step 0: Build 3FS
Follow the [instructions](../README.md#build-3fs) to build 3FS. Binaries can be found in `build/bin`.
### Services and clients
The following steps show how to install 3FS services in `/opt/3fs/bin` and the config files in `/opt/3fs/etc`.
| Service | Binary | Config files | NodeID | Node |
|------------|-------------------------|-----------------------------------------------------------------------------|--------|---------------|
| monitor | monitor_collector_main | [monitor_collector_main.toml](../configs/monitor_collector_main.toml) | - | meta |
| admin_cli | admin_cli | [admin_cli.toml](../configs/admin_cli.toml)<br>fdb.cluster | - | meta<br>storage1<br>storage2<br>storage3<br>storage4<br>storage5 |
| mgmtd | mgmtd_main | [mgmtd_main_launcher.toml](../configs/mgmtd_main_launcher.toml)<br>[mgmtd_main.toml](../configs/mgmtd_main.toml)<br>[mgmtd_main_app.toml](../configs/mgmtd_main_app.toml)<br>fdb.cluster | 1 | meta |
| meta | meta_main | [meta_main_launcher.toml](../configs/meta_main_launcher.toml)<br>[meta_main.toml](../configs/meta_main.toml)<br>[meta_main_app.toml](../configs/meta_main_app.toml)<br>fdb.cluster | 100 | meta |
| storage | storage_main | [storage_main_launcher.toml](../configs/storage_main_launcher.toml)<br>[storage_main.toml](../configs/storage_main.toml)<br>[storage_main_app.toml](../configs/storage_main_app.toml) | 10001~10005 | storage1<br>storage2<br>storage3<br>storage4<br>storage5 |
| client | hf3fs_fuse_main | [hf3fs_fuse_main_launcher.toml](../configs/hf3fs_fuse_main_launcher.toml)<br>[hf3fs_fuse_main.toml](../configs/hf3fs_fuse_main.toml) | - | meta |
---
## Step 1: Create ClickHouse tables for metrics
Import the SQL file into ClickHouse:
```bash
clickhouse-client -n < ~/3fs/deploy/sql/3fs-monitor.sql
```
---
## Step 2: Monitor service
Install `monitor_collector` service on the **meta** node.
1. Copy `monitor_collector_main` to `/opt/3fs/bin` and config files to `/opt/3fs/etc`, and create log directory `/var/log/3fs`.
```bash
mkdir -p /opt/3fs/{bin,etc}
mkdir -p /var/log/3fs
cp ~/3fs/build/bin/monitor_collector_main /opt/3fs/bin
cp ~/3fs/configs/monitor_collector_main.toml /opt/3fs/etc
```
2. Update [`monitor_collector_main.toml`](../configs/monitor_collector_main.toml) to add a ClickHouse connection:
```toml
[server.monitor_collector.reporter]
type = 'clickhouse'
[server.monitor_collector.reporter.clickhouse]
db = '3fs'
host = '<CH_HOST>'
passwd = '<CH_PASSWD>'
port = '<CH_PORT>'
user = '<CH_USER>'
```
3. Start monitor service:
```bash
cp ~/3fs/deploy/systemd/monitor_collector_main.service /usr/lib/systemd/system
systemctl start monitor_collector_main
```
Note that
> - Multiple instances of monitor services can be deployed behind a virtual IP address to share the traffic.
> - Other services communicate with the monitor service over a TCP connection.
---
## Step 3: Admin client
Install `admin_cli` on **all** nodes.
1. Copy `admin_cli` to `/opt/3fs/bin` and config files to `/opt/3fs/etc`.
```bash
mkdir -p /opt/3fs/{bin,etc}
rsync -avz meta:~/3fs/build/bin/admin_cli /opt/3fs/bin
rsync -avz meta:~/3fs/configs/admin_cli.toml /opt/3fs/etc
rsync -avz meta:/etc/foundationdb/fdb.cluster /opt/3fs/etc
```
2. Update [`admin_cli.toml`](../configs/admin_cli.toml) to set `cluster_id` and `clusterFile`:
```toml
cluster_id = "stage"
[fdb]
clusterFile = '/opt/3fs/etc/fdb.cluster'
```
The full help documentation for `admin_cli` can be displayed by running the following command:
```bash
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml help
```
---
## Step 4: Mgmtd service
Install `mgmtd` service on **meta** node.
1. Copy `mgmtd_main` to `/opt/3fs/bin` and config files to `/opt/3fs/etc`.
```bash
cp ~/3fs/build/bin/mgmtd_main /opt/3fs/bin
cp ~/3fs/configs/{mgmtd_main.toml,mgmtd_main_launcher.toml,mgmtd_main_app.toml} /opt/3fs/etc
```
2. Update config files:
- Set mgmtd `node_id = 1` in [`mgmtd_main_app.toml`](../configs/mgmtd_main_app.toml).
- Edit [`mgmtd_main_launcher.toml`](../configs/mgmtd_main_launcher.toml) to set the `cluster_id` and `clusterFile`:
```toml
cluster_id = "stage"
[fdb]
clusterFile = '/opt/3fs/etc/fdb.cluster'
```
- Set monitor address in [`mgmtd_main.toml`](../configs/mgmtd_main.toml):
```toml
[common.monitor.reporters.monitor_collector]
remote_ip = "192.168.1.1:10000"
```
3. Initialize the cluster:
```bash
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml "init-cluster --mgmtd /opt/3fs/etc/mgmtd_main.toml 1 1048576 16"
```
The parameters of `admin_cli`:
> - `1` the chain table ID
> - `1048576` the chunk size in bytes
> - `16` the file stripe size
Run `help init-cluster` for full documentation.
4. Start mgmtd service:
```bash
cp ~/3fs/deploy/systemd/mgmtd_main.service /usr/lib/systemd/system
systemctl start mgmtd_main
```
5. Run `list-nodes` command to check if the cluster has been successfully initialized:
```bash
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' "list-nodes"
```
If multiple instances of `mgmtd` services are deployed, one of the `mgmtd` services is elected as the primary; others are secondaries. Automatic failover occurs when the primary fails.
---
## Step 5: Meta service
Install `meta` service on **meta** node.
1. Copy `meta_main` to `/opt/3fs/bin` and config files to `/opt/3fs/etc`.
```bash
cp ~/3fs/build/bin/meta_main /opt/3fs/bin
cp ~/3fs/configs/{meta_main_launcher.toml,meta_main.toml,meta_main_app.toml} /opt/3fs/etc
```
2. Update config files:
- Set meta `node_id = 100` in [`meta_main_app.toml`](../configs/meta_main_app.toml).
- Set `cluster_id`, `clusterFile` and mgmtd address in [`meta_main_launcher.toml`](../configs/meta_main_launcher.toml):
```toml
cluster_id = "stage"
[mgmtd_client]
mgmtd_server_addresses = ["RDMA://192.168.1.1:8000"]
```
- Set mgmtd and monitor addresses in [`meta_main.toml`](../configs/meta_main.toml).
```toml
[server.mgmtd_client]
mgmtd_server_addresses = ["RDMA://192.168.1.1:8000"]
[common.monitor.reporters.monitor_collector]
remote_ip = "192.168.1.1:10000"
[server.fdb]
clusterFile = '/opt/3fs/etc/fdb.cluster'
```
3. Config file of meta service is managed by mgmtd service. Use `admin_cli` to upload the config file to mgmtd:
```bash
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' "set-config --type META --file /opt/3fs/etc/meta_main.toml"
```
4. Start meta service:
```bash
cp ~/3fs/deploy/systemd/meta_main.service /usr/lib/systemd/system
systemctl start meta_main
```
5. Run `list-nodes` command to check if meta service has joined the cluster:
```bash
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' "list-nodes"
```
If multiple instances of `meta` services are deployed, meta requests will be evenly distributed to all instances.
---
## Step 6: Storage service
Install `storage` service on **storage** node.
1. Format the attached 16 SSDs as XFS and mount at `/storage/data{1..16}`, then create data directories `/storage/data{1..16}/3fs` and log directory `/var/log/3fs`.
```bash
mkdir -p /storage/data{1..16}
mkdir -p /var/log/3fs
for i in {1..16};do mkfs.xfs -L data${i} /dev/nvme${i}n1;mount -o noatime,nodiratime -L data${i} /storage/data${i};done
mkdir -p /storage/data{1..16}/3fs
```
2. Increase the max number of asynchronous aio requests:
```bash
sysctl -w fs.aio-max-nr=67108864
```
3. Copy `storage_main` to `/opt/3fs/bin` and config files to `/opt/3fs/etc`.
```bash
rsync -avz meta:~/3fs/build/bin/storage_main /opt/3fs/bin
rsync -avz meta:~/3fs/configs/{storage_main_launcher.toml,storage_main.toml,storage_main_app.toml} /opt/3fs/etc
```
4. Update config files:
- Set `node_id` in [`storage_main_app.toml`](../configs/storage_main_app.toml). Each storage service is assigned a unique id between `10001` and `10005`.
- Set `cluster_id` and mgmtd address in [`storage_main_launcher.toml`](../configs/storage_main_launcher.toml).
```toml
cluster_id = "stage"
[mgmtd_client]
mgmtd_server_addresses = ["RDMA://192.168.1.1:8000"]
```
- Add target paths in [`storage_main.toml`](../configs/storage_main.toml):
```toml
[server.mgmtd]
mgmtd_server_addresses = ["RDMA://192.168.1.1:8000"]
[common.monitor.reporters.monitor_collector]
remote_ip = "192.168.1.1:10000"
[server.targets]
target_paths = ["/storage/data1/3fs","/storage/data2/3fs","/storage/data3/3fs","/storage/data4/3fs","/storage/data5/3fs","/storage/data6/3fs","/storage/data7/3fs","/storage/data8/3fs","/storage/data9/3fs","/storage/data10/3fs","/storage/data11/3fs","/storage/data12/3fs","/storage/data13/3fs","/storage/data14/3fs","/storage/data15/3fs","/storage/data16/3fs",]
```
5. Config file of storage service is managed by mgmtd service. Use `admin_cli` to upload the config file to mgmtd:
```bash
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' "set-config --type STORAGE --file /opt/3fs/etc/storage_main.toml"
```
6. Start storage service:
```bash
rsync -avz meta:~/3fs/deploy/systemd/storage_main.service /usr/lib/systemd/system
systemctl start storage_main
```
7. Run `list-nodes` command to check if storage service has joined the cluster:
```
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' "list-nodes"
```
---
## Step 7: Create admin user, storage targets and chain table
1. Create an admin user:
```bash
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' "user-add --root --admin 0 root"
```
Save the admin token to `/opt/3fs/etc/token.txt`.
2. Generate `admin_cli` commands to create storage targets on 5 storage nodes (16 SSD per node, 6 targets per SSD).
- Follow instructions at [here](data_placement/README.md) to install Python packages.
```bash
python ~/3fs/deploy/data_placement/src/model/data_placement.py \
-ql -relax -type CR --num_nodes 5 --replication_factor 3 --min_targets_per_disk 6
python ~/3fs/deploy/data_placement/src/setup/gen_chain_table.py \
--chain_table_type CR --node_id_begin 10001 --node_id_end 10005 \
--num_disks_per_node 16 --num_targets_per_disk 6 \
--target_id_prefix 1 --chain_id_prefix 9 \
--incidence_matrix_path output/DataPlacementModel-v_5-b_10-r_6-k_3-λ_2-lb_1-ub_1/incidence_matrix.pickle
```
The following 3 files will be generated in `output` directory: `create_target_cmd.txt`, `generated_chains.csv`, and `generated_chain_table.csv`.
3. Create storage targets:
```bash
/opt/3fs/bin/admin_cli --cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' --config.user_info.token $(<"/opt/3fs/etc/token.txt") < output/create_target_cmd.txt
```
4. Upload chains to mgmtd service:
```bash
/opt/3fs/bin/admin_cli --cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' --config.user_info.token $(<"/opt/3fs/etc/token.txt") "upload-chains output/generated_chains.csv"
```
5. Upload chain table to mgmtd service:
```bash
/opt/3fs/bin/admin_cli --cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' --config.user_info.token $(<"/opt/3fs/etc/token.txt") "upload-chain-table --desc stage 1 output/generated_chain_table.csv"
```
6. List chains and chain tables to check if they have been correctly uploaded:
```bash
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' "list-chains"
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' "list-chain-tables"
```
---
## Step 8: FUSE client
For simplicity, the FUSE client is deployed on the **meta** node in this guide. However, we strongly advise against deploying clients on service nodes in a production environment.
1. Copy `hf3fs_fuse_main` to `/opt/3fs/bin` and config files to `/opt/3fs/etc`.
```bash
cp ~/3fs/build/bin/hf3fs_fuse_main /opt/3fs/bin
cp ~/3fs/configs/{hf3fs_fuse_main_launcher.toml,hf3fs_fuse_main.toml,hf3fs_fuse_main_app.toml} /opt/3fs/etc
```
2. Create the mount point:
```bash
mkdir -p /3fs/stage
```
3. Set cluster ID, mountpoint, token file and mgmtd address in [`hf3fs_fuse_main_launcher.toml`](../configs/hf3fs_fuse_main_launcher.toml)
```toml
cluster_id = "stage"
mountpoint = '/3fs/stage'
token_file = '/opt/3fs/etc/token.txt'
[mgmtd_client]
mgmtd_server_addresses = ["RDMA://192.168.1.1:8000"]
```
4. Set mgmtd and monitor address in [`hf3fs_fuse_main.toml`](../configs/hf3fs_fuse_main.toml).
```toml
[mgmtd]
mgmtd_server_addresses = ["RDMA://192.168.1.1:8000"]
[common.monitor.reporters.monitor_collector]
remote_ip = "192.168.1.1:10000"
```
5. Config file of FUSE client is also managed by mgmtd service. Use `admin_cli` to upload the config file to mgmtd:
```bash
/opt/3fs/bin/admin_cli -cfg /opt/3fs/etc/admin_cli.toml --config.mgmtd_client.mgmtd_server_addresses '["RDMA://192.168.1.1:8000"]' "set-config --type FUSE --file /opt/3fs/etc/hf3fs_fuse_main.toml"
```
6. Start FUSE client:
```bash
cp ~/3fs/deploy/systemd/hf3fs_fuse_main.service /usr/lib/systemd/system
systemctl start hf3fs_fuse_main
```
7. Check if 3FS has been mounted at `/3fs/stage`:
```bash
mount | grep '/3fs/stage'
```
## FAQ
<details>
<summary>How to troubleshoot <code>admin_cli init-cluster</code> error?</summary>
If mgmtd fails to start after running `init-cluster`, the most likely cause is an error in `mgmtd_main.toml`. Any changes to this file require clearing all FoundationDB data and re-running `init-cluster`.
</details>
---
<details>
<summary>How to build a single-node cluster?</summary>
A minimum of two storage services is required for data replication. If `--num_nodes=1` is set, the `gen_chain_table.py` script will fail. In a test environment, this limitation can be bypassed by deploying multiple storage services on a single machine.
</details>
---
<details>
<summary>How to update config files?</summary>
All config files are managed by mgmtd. If any `*_main.toml` is updated, such as `storage_main.toml`, the modified file should be uploaded using `admin_cli set-config`.
</details>
---
<details>
<summary>How to troubleshoot common deployment issues?</summary>
When encountering any error during deployment,
- Check the log messages in `stdout/stderr` using `journalctl`, especially during service startup.
- Check log files stored in `/var/log/3fs/` on service and client nodes.
- Ensure that the directory `/var/log/3fs/` exists before starting any service.
</details>

17
deploy/data_placement/.gitignore vendored Normal file
View File

@@ -0,0 +1,17 @@
__pycache__
.ipynb_checkpoints
.tmp/
dist/
build/
output/
*.egg-info/
test/scratch/
test/runtime/
*.log
*.pyc
*.xml
.tmp/
.idea
.coverage
.vscode/
.hypothesis/

View File

@@ -0,0 +1,60 @@
# How to generate chain tables
Suppose we are going to setup a small 3FS cluster:
- 3 replicas for each chunk
- 5 storage nodes: `10001 ... 10005`
- 16 SSDs attached to each node
- 6 storage targets on each SSD
First generate a solution of the data placement problem.
```bash
$ python src/model/data_placement.py -ql -relax -type CR --num_nodes 5 --replication_factor 3 --min_targets_per_disk 6 --init_timelimit 600
...
2025-02-24 14:25:13.623 | SUCCESS | __main__:solve:165 - optimal solution:
- Status: ok
Termination condition: optimal
Termination message: TerminationCondition.optimal
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 1,2: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 1,3: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 1,4: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 1,5: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 2,1: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 2,3: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 2,4: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 2,5: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 3,1: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 3,2: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 3,4: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 3,5: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 4,1: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 4,2: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 4,3: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 4,5: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 5,1: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 5,2: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 5,3: 1.5
2025-02-24 14:25:13.624 | DEBUG | __main__:check_solution:322 - 5,4: 1.5
2025-02-24 14:25:13.624 | INFO | __main__:check_solution:331 - min_peer_traffic=1.5 max_peer_traffic=1.5
2025-02-24 14:25:13.624 | INFO | __main__:check_solution:332 - total_traffic=30.0 max_total_traffic=30
2025-02-24 14:25:14.147 | SUCCESS | __main__:run:147 - saved solution to: output/DataPlacementModel-v_5-b_10-r_6-k_3-λ_2-lb_1-ub_1
```
Note that some combinations of `--num_nodes` and `--replication_factor` may have no solution.
Then generate commands to create/remove storage targets.
```bash
$ python src/setup/gen_chain_table.py --chain_table_type CR --node_id_begin 10001 --node_id_end 10005 --num_disks_per_node 16 --num_targets_per_disk 6 --incidence_matrix_path output/DataPlacementModel-v_5-b_10-r_6-k_3-λ_2-lb_1-ub_1/incidence_matrix.pickle
$ ls -1 output/
DataPlacementModel-v_5-b_10-r_6-k_3-λ_2-lb_1-ub_1
appsi_highs.log
create_target_cmd.txt
generated_chain_table.csv
generated_chains.csv
remove_target_cmd.txt
```

View File

@@ -0,0 +1,12 @@
psutil
pandas
plotly
loguru
highspy==1.8.0
pyomo==6.8.0
coverage~=7.4.4
pytest==8.2.1
pytest-cov==5.0.0
pytest-forked==1.6.0
pytest-xdist==3.6.1
pytest-timeout==2.3.1

View File

View File

@@ -0,0 +1,549 @@
import math
import pickle
import random
import time
import psutil
import os.path
import pandas as pd
import pyomo.environ as po
import plotly.express as px
from typing import Dict, Generator, Literal, Tuple
from loguru import logger
from pyomo.opt import SolverStatus, TerminationCondition
class InfeasibleModel(Exception):
    """Raised when the solver proves the placement model has no feasible solution."""
class SolverTimeout(Exception):
    """Raised when the solver hits its time or iteration limit before converging."""
class SolverError(Exception):
    """Raised for any solver failure that is neither infeasibility nor a timeout."""
class InvalidSolution(Exception):
    """Raised when a returned solution fails the model's own validation checks."""
class DataPlacementModel(object):
    def __init__(self, chain_table_type: Literal["EC", "CR"], num_nodes, group_size, num_groups=None, num_targets_per_disk=None, min_targets_per_disk=1, bibd_only=False, qlinearize=False, relax_lb=1, relax_ub=0):
        """Configure a data-placement model instance.

        Args:
            chain_table_type: "EC" or "CR"; only affects recovery_traffic_factor.
            num_nodes: number of storage nodes (block-design v).
            group_size: number of targets per group/chain (block-design k).
            num_groups: number of groups (block-design b); derived from the
                other parameters when omitted.
            num_targets_per_disk: targets per disk (block-design r); when None,
                a feasible (v, b, r, k) combination is searched via find_params().
            min_targets_per_disk: lower bound for r used by the parameter search.
            bibd_only: restrict the parameter search to balanced incomplete
                block designs (see find_params / balanced_incomplete_block_design).
            qlinearize: presumably switches to a linearized formulation; forced
                on for HiGHS solvers in solve() — TODO confirm against build_model.
            relax_lb: lower relaxation bound (semantics set by the model
                constraints, not visible here).
            relax_ub: upper relaxation bound (same caveat as relax_lb).
        """
        # When r is not given explicitly, search for a feasible parameter set.
        if num_targets_per_disk is None:
            num_nodes, num_groups, num_targets_per_disk, group_size = DataPlacementModel.find_params(num_nodes, group_size, min_r=min_targets_per_disk, bibd_only=bibd_only)
        self.chain_table_type = chain_table_type
        self.num_nodes = num_nodes
        self.group_size = group_size
        self.num_targets_per_disk = num_targets_per_disk
        # Fallback: use every available target exactly once (total // k).
        self.num_groups = num_groups or self.num_targets_total // self.group_size
        self.bibd_only = bibd_only
        self.qlinearize = qlinearize
        self.relax_lb = relax_lb
        self.relax_ub = relax_ub
    def __repr__(self):
        """Render the design parameters, e.g. "DataPlacementModel-v=5,b=10,r=6,k=3,λ=2,lb=1,ub=0"."""
        v, b, r, k, λ = self.v, self.b, self.r, self.k, self.λ
        lb, ub = self.relax_lb, self.relax_ub
        return f"{self.__class__.__name__}-{v=},{b=},{r=},{k=},{λ=},{lb=},{ub=}"
    # str() and repr() are interchangeable for this class.
    __str__ = __repr__
@property
def path_name(self):
return str(self).translate(str.maketrans(' ,:=', '---_'))
    @property
    def v(self):
        # Block-design notation: v = number of nodes (points).
        return self.num_nodes
    @property
    def b(self):
        # Block-design notation: b = number of groups (blocks).
        return self.num_groups
    @property
    def r(self):
        # Block-design notation: r = targets per disk (replication of a point).
        return self.num_targets_per_disk
    @property
    def k(self):
        # Block-design notation: k = group size (block size).
        return self.group_size
    @property
    def λ(self):
        # Block-design notation: alias of max_recovery_traffic_on_peer.
        return self.max_recovery_traffic_on_peer
@property
def num_targets_used(self):
return self.num_groups * self.group_size
@property
def num_targets_total(self):
return self.num_nodes * self.num_targets_per_disk
@property
def all_targets_used(self):
return self.num_targets_used == self.num_targets_total
@property
def balanced_peer_traffic(self):
return self.all_targets_used and self.sum_recovery_traffic_per_failure % (self.num_nodes-1) == 0
@property
def recovery_traffic_factor(self):
return (self.group_size - 1) if self.chain_table_type == "EC" else 1
@property
def sum_recovery_traffic_per_failure(self):
return self.num_targets_per_disk * self.recovery_traffic_factor
@property
def max_recovery_traffic_on_peer(self):
return math.ceil(self.sum_recovery_traffic_per_failure / (self.num_nodes-1))
@property
def balanced_incomplete_block_design(self):
return self.bibd_only and self.balanced_peer_traffic and self.relax_ub == 0
@staticmethod
def find_params(v, k, min_r=1, max_r=100, bibd_only=False):
if bibd_only: min_r = max(min_r, k)
for r in range(min_r, max_r):
if v * r % k == 0 and r * (k - 1) >= v - 1:
b = v * r // k
if not bibd_only or r * (k - 1) % (v - 1) == 0:
return v, b, r, k
raise ValueError(f"cannot find valid params: {v=}, {k=}")
def run(self, pyomo_solver=None, threads=psutil.cpu_count(logical=False), init_timelimit=1800, max_timelimit=3600*2, auto_relax=False, output_root="output", verbose=False, add_elapsed_time=None):
init_relax_lb = self.relax_lb
init_relax_ub = self.relax_ub
timelimit = 0
num_loops = self.max_recovery_traffic_on_peer*2
os.makedirs(output_root, exist_ok=True)
for loop in range(num_loops):
try:
logger.info(f"solving model with {pyomo_solver} #{loop}: {self}")
if add_elapsed_time is not None:
add_elapsed_time()
timelimit = min(timelimit + init_timelimit, max_timelimit)
instance = self.solve(pyomo_solver, threads, timelimit, output_root, verbose)
if add_elapsed_time is not None:
add_elapsed_time(f"solve model time (lb={self.relax_lb}, ub={self.relax_ub})")
except (InfeasibleModel, SolverTimeout) as ex:
logger.error(f"cannot find solution for current params: {ex}")
if auto_relax:
self.relax_lb = init_relax_lb + (loop+1) // 2
self.relax_ub = init_relax_ub + (loop+2) // 2
continue
elif loop + 1 < num_loops:
logger.critical(f"failed to find solution after {num_loops} attempts")
raise ex
else:
raise ex
else:
output_path = os.path.join(output_root, self.path_name)
os.makedirs(output_path, exist_ok=True)
self.save_solution(instance, output_path)
self.visualize_solution(instance, output_path)
logger.success(f"saved solution to: {output_path}")
return instance
# Fix: logger.catch(...) was a bare statement whose returned decorator was
# discarded (a no-op); it is meant to decorate solve() so unexpected exceptions
# are logged with "failed to solve model" and re-raised.
@logger.catch(reraise=True, message="failed to solve model")
def solve(self, pyomo_solver=None, threads=psutil.cpu_count(logical=False), timelimit=3600, output_path="output", verbose=False):
    """Build one model instance, run the solver, and validate the solution.

    Returns the solved instance.  Raises InfeasibleModel, SolverTimeout,
    SolverError, or InvalidSolution depending on the outcome.
    """
    # HiGHS only accepts the linearized formulation; force it on.
    if "highs" in pyomo_solver:
        self.qlinearize = True
    instance = self.build_model()
    if verbose:
        instance.pprint()
    try:
        results = self.solve_model(instance, pyomo_solver, threads, timelimit, output_path)
    except RuntimeError as ex:
        raise SolverError("unknown runtime error") from ex
    # Classify the solver outcome into success / infeasible / timeout / error.
    if (results.solver.status == SolverStatus.ok) and (results.solver.termination_condition == TerminationCondition.optimal):
        logger.success(f"optimal solution: {str(results.solver)}")
        if pyomo_solver is not None:
            instance.solutions.load_from(results)
    elif results.solver.termination_condition == TerminationCondition.infeasible:
        raise InfeasibleModel(f"infeasible: {str(results.solver)}")
    elif results.solver.termination_condition in (TerminationCondition.maxTimeLimit, TerminationCondition.maxIterations):
        raise SolverTimeout(f"timeout: {str(results.solver)}")
    else:
        raise SolverError(f"error: {str(results.solver)}")
    if verbose:
        self.print_solution(instance)
    # Cross-check the loaded solution against the model's traffic bounds.
    try:
        self.check_solution(instance)
    except AssertionError as ex:
        raise InvalidSolution from ex
    return instance
def build_model(self):
    """Construct the Pyomo ConcreteModel for the data-placement problem.

    Decision variable disk_used_by_group[d, g] = 1 iff disk d hosts a target of
    group g.  Constraints enforce per-disk capacity, exact group size, and
    bounds on pairwise ("peer") recovery traffic.  The objective is a constant:
    this is solved as a pure feasibility problem.
    """
    logger.info(f"{self.num_nodes=} {self.num_targets_per_disk=} {self.group_size=} {self.num_groups=} {self.qlinearize=} {self.relax_lb=} {self.relax_ub=}")
    # v >= k
    assert self.num_nodes >= self.group_size, f"{self.num_nodes=} < {self.group_size=}"
    # Fisher's inequality
    if self.balanced_incomplete_block_design:
        # b >= v
        assert self.num_groups >= self.num_nodes, f"{self.num_groups=} < {self.num_nodes=}"
        # r >= k
        assert self.num_targets_per_disk >= self.group_size, f"{self.num_targets_per_disk=} < {self.group_size=}"
    logger.info(f"{self.sum_recovery_traffic_per_failure=} {self.max_recovery_traffic_on_peer=}")
    if self.sum_recovery_traffic_per_failure < self.num_nodes - 1:
        logger.warning(f"some disks do not share recovery traffic: {self.sum_recovery_traffic_per_failure=} < {self.num_nodes=} - 1")
    logger.info(f"{self.all_targets_used=} {self.balanced_peer_traffic=}")
    logger.info(f"{self.num_targets_used=} {self.num_targets_total=}")
    if self.num_targets_used < self.num_targets_total:
        logger.warning(f"some disks have unused targets: {self.num_targets_used=} < {self.num_targets_total=}")
    else:
        assert self.num_targets_used == self.num_targets_total, f"{self.num_targets_used=} > {self.num_targets_total=}"
    model = po.ConcreteModel()
    # index sets
    model.disks = po.RangeSet(1, self.num_nodes)
    model.target_idxs = po.RangeSet(1, self.num_targets_per_disk)
    model.targets = model.disks * model.target_idxs
    model.groups = po.RangeSet(1, self.num_groups)
    def disk_pairs_init(model):
        # Unordered disk pairs (disk, peer) with peer > disk.
        for disk in model.disks:
            for peer in model.disks:
                if peer > disk:
                    yield (disk, peer)
    model.disk_pairs = po.Set(dimen=2, initialize=disk_pairs_init)
    # variables
    model.disk_used_by_group = po.Var(model.disks, model.groups, domain=po.Binary)
    if self.qlinearize:
        # Auxiliary binary that linearizes the product
        # disk_used_by_group[d,g] * disk_used_by_group[p,g] (McCormick-style).
        model.disk_in_same_group = po.Var(model.disk_pairs, model.groups, domain=po.Binary)
    # constraints
    def calc_disk_in_same_group(model, disk, peer, group):
        # Quadratic same-group indicator, used directly when not linearizing.
        return model.disk_used_by_group[disk,group] * model.disk_used_by_group[peer,group]
    def define_disk_in_same_group_lower_bound(model, disk, peer, group):
        # z >= x + y - 1: forces z = 1 when both disks are in the group.
        return model.disk_used_by_group[disk,group] + model.disk_used_by_group[peer,group] <= model.disk_in_same_group[disk,peer,group] + 1
    def define_disk_in_same_group_upper_bound1(model, disk, peer, group):
        # z <= x: z cannot exceed either factor.
        return model.disk_in_same_group[disk,peer,group] <= model.disk_used_by_group[disk,group]
    def define_disk_in_same_group_upper_bound2(model, disk, peer, group):
        # z <= y.
        return model.disk_in_same_group[disk,peer,group] <= model.disk_used_by_group[peer,group]
    if self.qlinearize:
        model.define_disk_in_same_group_lower_bound_eqn = po.Constraint(model.disk_pairs, model.groups, rule=define_disk_in_same_group_lower_bound)
        model.define_disk_in_same_group_upper_bound1_eqn = po.Constraint(model.disk_pairs, model.groups, rule=define_disk_in_same_group_upper_bound1)
        model.define_disk_in_same_group_upper_bound2_eqn = po.Constraint(model.disk_pairs, model.groups, rule=define_disk_in_same_group_upper_bound2)
    def each_disk_has_limited_capcity(model, disk):
        # Each disk hosts at most (or, when all targets must be used, exactly)
        # num_targets_per_disk group memberships.
        if self.all_targets_used:
            return po.quicksum(model.disk_used_by_group[disk,group] for group in model.groups) == self.num_targets_per_disk
        else:
            return po.quicksum(model.disk_used_by_group[disk,group] for group in model.groups) <= self.num_targets_per_disk
    model.each_disk_has_limited_capcity_eqn = po.Constraint(model.disks, rule=each_disk_has_limited_capcity)
    def enough_disks_assigned_to_each_group(model, group):
        # Every group gets exactly group_size member disks.
        return po.quicksum(model.disk_used_by_group[disk,group] for disk in model.disks) == self.group_size
    model.enough_disks_assigned_to_each_group_eqn = po.Constraint(model.groups, rule=enough_disks_assigned_to_each_group)
    def calc_peer_recovery_traffic(model, disk, peer):
        # Number of groups shared by the pair — proportional to recovery traffic.
        if self.qlinearize:
            return po.quicksum(model.disk_in_same_group[disk,peer,group] for group in model.groups)
        else:
            return po.quicksum(calc_disk_in_same_group(model, disk, peer, group) for group in model.groups)
    def peer_recovery_traffic_upper_bound(model, disk, peer):
        # BIBD: every pair shares exactly the balanced amount; otherwise cap it
        # at the balanced amount plus the allowed relaxation.
        if self.balanced_incomplete_block_design:
            return calc_peer_recovery_traffic(model, disk, peer) == self.max_recovery_traffic_on_peer
        else:
            return calc_peer_recovery_traffic(model, disk, peer) <= self.max_recovery_traffic_on_peer + self.relax_ub
    model.peer_recovery_traffic_upper_bound_eqn = po.Constraint(model.disk_pairs, rule=peer_recovery_traffic_upper_bound)
    def peer_recovery_traffic_lower_bound(model, disk, peer):
        return calc_peer_recovery_traffic(model, disk, peer) >= max(0, self.max_recovery_traffic_on_peer - self.relax_lb)
    if self.balanced_incomplete_block_design:
        logger.info(f"lower bound not needed for balanced incomplete block design (BIBD)")
    elif self.all_targets_used:
        logger.info(f"lower bound imposed on peer traffic: {self.relax_lb=} {self.qlinearize=} {self.all_targets_used=}")
        model.peer_recovery_traffic_lower_bound_eqn = po.Constraint(model.disk_pairs, rule=peer_recovery_traffic_lower_bound)
    else:
        logger.info(f"lower bound not imposed on peer traffic: {self.relax_lb=} {self.qlinearize=} {self.all_targets_used=}")
    def total_recovery_traffic(model):
        # NOTE(review): references disk_in_same_group, which exists only when
        # qlinearize is set; currently dead code since the objective below is a dummy.
        return po.summation(model.disk_in_same_group) * 2
    # model.obj = po.Objective(rule=total_recovery_traffic, sense=po.minimize)
    model.obj = po.Objective(expr=1) # dummy objective
    return model
def solve_model(self, instance, pyomo_solver, threads, timelimit, output_path):
    """Dispatch the instance to the named Pyomo solver and return its results.

    Solutions are NOT loaded into the instance (load_solutions=False); the
    caller decides when to load them.  Raises ValueError when pyomo_solver is
    None.  Solver output is teed to the console and logged under output_path.
    """
    # Guard clause first (also drops the pointless f-prefix on a literal).
    if pyomo_solver is None:
        raise ValueError("no solver specified")
    solver = po.SolverFactory(pyomo_solver)
    return solver.solve(
        instance,
        options={"threads": str(threads), "log_file": os.path.join(output_path, f"{pyomo_solver}.log")},
        load_solutions=False,
        timelimit=timelimit,
        tee=True)
def get_peer_traffic(self, instance) -> Dict[Tuple[int,int], int]:
    """Return the recovery traffic between every ordered (disk, peer) pair.

    For each pair, counts the groups both disks belong to in the solved
    instance and scales by recovery_traffic_factor / (group_size - 1).
    """
    peer_traffic_map = {}
    for disk in instance.disks:
        for peer in instance.disks:
            if disk == peer:
                continue
            shared_groups = sum(
                po.value(instance.disk_used_by_group[disk,group]) *
                po.value(instance.disk_used_by_group[peer,group])
                for group in instance.groups)
            peer_traffic_map[(disk, peer)] = shared_groups * self.recovery_traffic_factor / (self.group_size - 1)
    return peer_traffic_map
def get_incidence_matrix(self, instance) -> Dict[Tuple[int, int], bool]:
    """Extract {(disk, group): True} for every assignment set to 1 in the solution."""
    incidence_matrix = {
        (disk, group): True
        for disk in instance.disks
        for group in instance.groups
        if math.isclose(po.value(instance.disk_used_by_group[disk, group]), 1)
    }
    if self.all_targets_used:
        # With all targets used, assignments divide evenly by nodes and by groups.
        assert len(incidence_matrix) % self.num_nodes == 0, f"{len(incidence_matrix)=} % {self.num_nodes=}"
        assert len(incidence_matrix) % self.num_groups == 0, f"{len(incidence_matrix)=} % {self.num_groups=}"
    return incidence_matrix
def check_solution(self, instance):
    """Validate the solved instance against the peer-traffic bounds.

    Returns (total_traffic, min_peer_traffic, max_peer_traffic).  Raises
    AssertionError on violation; solve() wraps that as InvalidSolution.
    """
    # Detect whether the optional lower-bound constraint was part of this model.
    has_peer_traffic_lower_bound = False
    for c in instance.component_objects(po.Constraint):
        if "peer_recovery_traffic_lower_bound_eqn" in str(c):
            has_peer_traffic_lower_bound = True
    peer_traffic_map = self.get_peer_traffic(instance)
    for (disk, peer), peer_traffic in peer_traffic_map.items():
        logger.debug(f"{disk},{peer}: {peer_traffic:.1f}")
        # 1e-5 tolerances absorb solver round-off on the binary products.
        assert peer_traffic <= self.max_recovery_traffic_on_peer + self.relax_ub + 1e-5, f"{peer_traffic=} > {self.max_recovery_traffic_on_peer=} + {self.relax_ub}"
        if has_peer_traffic_lower_bound:
            assert peer_traffic >= max(0, self.max_recovery_traffic_on_peer - self.relax_lb) - 1e-5, f"{peer_traffic=} < {self.max_recovery_traffic_on_peer=} - {self.relax_lb}"
    min_peer_traffic = min(peer_traffic_map.values())
    max_peer_traffic = max(peer_traffic_map.values())
    total_traffic = sum(peer_traffic_map.values())
    max_total_traffic = self.num_nodes * self.sum_recovery_traffic_per_failure
    logger.info(f"{min_peer_traffic=:.1f} {max_peer_traffic=:.1f}")
    logger.info(f"{total_traffic=} {max_total_traffic=}")
    # Spread between the most and least loaded peer must fit the relaxations.
    peer_traffic_diff = max_peer_traffic - min_peer_traffic
    if has_peer_traffic_lower_bound:
        assert peer_traffic_diff <= self.relax_ub + self.relax_lb + 1e-5, f"{peer_traffic_diff=}"
    if self.balanced_incomplete_block_design:
        # A BIBD distributes recovery traffic perfectly evenly across peers.
        assert math.isclose(peer_traffic_diff, 0.0, abs_tol=1e-9), f"{peer_traffic_diff=}"
    assert total_traffic <= max_total_traffic + 1e-5
    return total_traffic, min_peer_traffic, max_peer_traffic
def print_solution(self, instance):
    """Log every disk/group assignment variable whose solved value equals 1."""
    for disk in instance.disks:
        for group in instance.groups:
            assignment = instance.disk_used_by_group[disk, group]
            if not math.isclose(po.value(assignment), 1):
                continue
            logger.info(f"{assignment}: {po.value(assignment)}")
def save_solution(self, instance, output_path: str="output"):
    """Pickle the incidence matrix and the peer-traffic map under output_path."""
    artifacts = {
        "incidence_matrix.pickle": self.get_incidence_matrix(instance),
        "peer_traffic_map.pickle": self.get_peer_traffic(instance),
    }
    for filename, payload in artifacts.items():
        with open(os.path.join(output_path, filename), "wb") as fout:
            pickle.dump(payload, fout)
def visualize_solution(self, instance, output_path: str="output", write_html=True):
    """Render the disk/group incidence as a scatter plot and optionally write it to HTML.

    Returns the plotly figure.
    """
    incidence_matrix = self.get_incidence_matrix(instance)
    disks, groups = zip(*incidence_matrix.keys())
    incidence_df = pd.DataFrame(zip(disks, groups), columns=["disk", "group"])
    peer_traffic_values = self.get_peer_traffic(instance).values()
    min_peer_traffic = min(peer_traffic_values)
    max_peer_traffic = max(peer_traffic_values)
    fig = px.scatter(
        incidence_df,
        x="disk",
        y="group",
        title=f"{self}, min/max peer traffic: {min_peer_traffic:.1f}/{max_peer_traffic:.1f}")
    def integer_axis(count):
        # Force one tick per integer id instead of plotly's automatic ticks.
        return dict(tickmode='array', tickvals=list(range(1, count + 1)))
    fig.update_layout(
        xaxis_title="Nodes",
        yaxis_title="Groups",
        xaxis=integer_axis(self.num_nodes),
        yaxis=integer_axis(self.num_groups),
    )
    if write_html:
        fig.write_html(os.path.join(output_path, f"data_placement.html"), include_plotlyjs=True)
    return fig
class RebalanceTrafficModel(DataPlacementModel):
    """Placement model for cluster expansion.

    Given the incidence matrix of an existing layout, finds a new layout for a
    larger cluster that minimizes the number of existing targets that must be
    moved, while keeping the base model's recovery-traffic constraints.
    """
    def __init__(self, existing_incidence_matrix, chain_table_type: Literal["EC", "CR"], num_nodes, group_size, num_groups=None, num_targets_per_disk=None, min_targets_per_disk=1, bibd_only=False, qlinearize=False, relax_lb=1, relax_ub=0):
        # existing_incidence_matrix: {(disk, group): True} describing the current layout.
        self.existing_incidence_matrix = existing_incidence_matrix
        self.existing_disks, self.existing_groups = zip(*existing_incidence_matrix.keys())
        # Each disk must be able to host at least the average existing load.
        num_existing_targets_per_disk = math.ceil(self.total_existing_targets / self.num_existing_disk)
        min_targets_per_disk = max(min_targets_per_disk, num_existing_targets_per_disk)
        if num_targets_per_disk is None:
            num_nodes, num_groups, num_targets_per_disk, group_size = DataPlacementModel.find_params(num_nodes, group_size, min_r=min_targets_per_disk, bibd_only=bibd_only)
        else:
            assert num_targets_per_disk >= min_targets_per_disk
        super().__init__(chain_table_type, num_nodes, group_size, num_groups, num_targets_per_disk, min_targets_per_disk, bibd_only, qlinearize, relax_lb, relax_ub)
    @property
    def num_existing_disk(self):
        # Disks are numbered 1..N, so the max id equals the count.
        return max(self.existing_disks)
    @property
    def num_existing_groups(self):
        # Groups are numbered 1..B, so the max id equals the count.
        return max(self.existing_groups)
    @property
    def total_existing_targets(self):
        # One (disk, group) entry per existing target.
        return len(self.existing_disks)
    @property
    def existing_group_size(self):
        # Existing layout must have uniform group size.
        assert self.total_existing_targets % self.num_existing_groups == 0, f"{self.total_existing_targets=} % {self.num_existing_groups=}"
        return self.total_existing_targets // self.num_existing_groups
    def build_model(self):
        """Extend the base model with rebalance constraints and a move-minimizing objective."""
        max_existing_targets_per_disk = math.ceil(self.total_existing_targets / self.num_nodes)
        logger.info(f"{self.num_existing_disk=} {self.num_existing_groups=} {self.total_existing_targets=} {max_existing_targets_per_disk=}")
        # The new cluster must be a superset of the old one with identical group size.
        assert self.num_nodes >= self.num_existing_disk, f"{self.num_nodes=} < {self.num_existing_disk=}"
        assert self.num_groups >= self.num_existing_groups, f"{self.num_groups=} < {self.num_existing_groups=}"
        assert self.group_size == self.existing_group_size, f"{self.group_size=} != {self.existing_group_size=}"
        # NOTE(review): message prints '>=' though the failure condition is '<'.
        assert self.num_targets_per_disk >= max_existing_targets_per_disk, f"{self.num_targets_per_disk=} >= {max_existing_targets_per_disk=}"
        model = super().build_model()
        def existing_targets_evenly_distributed_to_disks(model, disk):
            # Cap how many of the pre-existing groups land on any single disk.
            return po.quicksum(model.disk_used_by_group[disk,group] for group in model.groups if group <= self.num_existing_groups) <= max_existing_targets_per_disk
        model.existing_targets_evenly_distributed_to_disks_eqn = po.Constraint(model.disks, rule=existing_targets_evenly_distributed_to_disks)
        def num_existing_targets_not_moved(model):
            # Count assignments that coincide with the existing layout.
            return po.quicksum(model.disk_used_by_group[disk,group] for disk in model.disks for group in model.groups if (disk,group) in self.existing_incidence_matrix)
        def total_rebalance_traffic(model):
            # Every existing target NOT kept in place must be migrated.
            return self.total_existing_targets - num_existing_targets_not_moved(model)
        # NOTE(review): a rule function is passed via `expr=`; Pyomo's documented
        # keyword for callables is `rule=` — confirm `expr=` accepts callables in
        # the pinned Pyomo version.
        model.obj = po.Objective(expr=total_rebalance_traffic, sense=po.minimize)
        return model
    def visualize_solution(self, instance, output_path = "output", write_html=True):
        """Scatter-plot the new layout, coloring groups added by the expansion."""
        incidence_matrix = self.get_incidence_matrix(instance)
        disks, groups = zip(*incidence_matrix.keys())
        # "new" marks groups beyond the pre-existing ones.
        incidence_df = pd.DataFrame(zip(disks, groups, [g > self.num_existing_groups for g in groups]), columns=["disk", "group", "new"])
        peer_traffic_map = self.get_peer_traffic(instance)
        min_peer_traffic = min(peer_traffic_map.values())
        max_peer_traffic = max(peer_traffic_map.values())
        fig = px.scatter(
            incidence_df,
            x="disk",
            y="group",
            color="new",
            title=f"{self}, min/max peer traffic: {min_peer_traffic:.1f}/{max_peer_traffic:.1f}, rebalance traffic: {po.value(instance.obj.expr)}")
        fig.update_layout(
            xaxis_title="Nodes",
            yaxis_title="Groups",
            xaxis = dict(
                tickmode = 'array',
                tickvals = list(range(1, self.num_nodes+1)),
            ),
            yaxis = dict(
                tickmode = 'array',
                tickvals = list(range(1, self.num_groups+1)),
            ),
        )
        if write_html:
            fig.write_html(os.path.join(output_path, f"{self.path_name}.html"), include_plotlyjs=True)
        return fig
def main():
    """CLI entry point: solve a fresh placement model, or a rebalance model when
    an existing incidence matrix is supplied via -m/--existing_incidence_matrix."""
    import psutil
    import argparse
    parser = argparse.ArgumentParser(prog="model.py", description="3FS data placement")
    parser.add_argument("-pyomo", "--pyomo_solver", default="appsi_highs", choices=["appsi_highs", "cbc", "scip"], help="Solver used by Pyomo")
    parser.add_argument("-type", "--chain_table_type", type=str, required=True, choices=["CR", "EC"], help="CR - Chain Replication; EC - Erasure Coding")
    parser.add_argument("-j", "--solver_threads", type=int, default=psutil.cpu_count(logical=False)//2, help="Number of solver threads")
    parser.add_argument("-v", "--num_nodes", type=int, required=True, help="Number of storage nodes")
    parser.add_argument("-r", "--num_targets_per_disk", type=int, default=None, help="Number of storage targets on each disk")
    parser.add_argument("-min_r", "--min_targets_per_disk", type=int, default=1, help="Min number of storage targets on each disk")
    parser.add_argument("-k", "--replication_factor", "--group_size", dest="group_size", type=int, default=3, help="Replication factor or erasure coding group size")
    parser.add_argument("-b", "--num_groups", type=int, default=None, help="Number of chains or EC groups")
    parser.add_argument("-ql", "--qlinearize", action="store_true", help="Enable linearization of quadratic equations")
    parser.add_argument("-lb", "--relax_lb", type=int, default=1, help="Relax the lower bound of peer recovery traffic")
    parser.add_argument("-ub", "--relax_ub", type=int, default=0, help="Relax the upper bound of peer recovery traffic")
    parser.add_argument("-relax", "--auto_relax", action="store_true", help="Auto relax the lower/upper bound of peer recovery traffic when timeout")
    parser.add_argument("-bibd", "--bibd_only", action="store_true", help="Only create balanced incomplete block design (BIBD)")
    parser.add_argument("-t", "--init_timelimit", type=int, default=1800, help="Initial timeout for solver")
    parser.add_argument("-T", "--max_timelimit", type=int, default=3600*2, help="Max timeout for solver")
    parser.add_argument("-o", "--output_path", default="output", help="Path of output files")
    parser.add_argument("-m", "--existing_incidence_matrix", default=None, help="Existing incidence matrix for rebalance traffic model")
    parser.add_argument("-V", "--verbose", action="store_true", help="Show verbose output")
    args = parser.parse_args()
    if args.existing_incidence_matrix is None:
        # Fresh placement: solve the base model from scratch.
        DataPlacementModel(
            args.chain_table_type,
            args.num_nodes,
            args.group_size,
            args.num_groups,
            args.num_targets_per_disk,
            args.min_targets_per_disk,
            args.bibd_only,
            args.qlinearize,
            args.relax_lb,
            args.relax_ub,
        ).run(
            args.pyomo_solver,
            args.solver_threads,
            args.init_timelimit,
            args.max_timelimit,
            args.auto_relax,
            args.output_path,
            args.verbose)
    else:
        # Rebalance: load the current layout and minimize moved targets.
        with open(args.existing_incidence_matrix, "rb") as fin:
            existing_incidence_matrix = pickle.load(fin)
        RebalanceTrafficModel(
            existing_incidence_matrix,
            args.chain_table_type,
            args.num_nodes,
            args.group_size,
            args.num_groups,
            args.num_targets_per_disk,
            args.min_targets_per_disk,
            args.bibd_only,
            args.qlinearize,
            args.relax_lb,
            args.relax_ub,
        ).run(
            args.pyomo_solver,
            args.solver_threads,
            args.init_timelimit,
            args.max_timelimit,
            args.auto_relax,
            args.output_path,
            args.verbose)
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,108 @@
# local test
# pytest test/test_plan.py -v -x
# production setup
import functools
import socket
import sys
import os.path
import itertools
import pandas as pd
import pyarrow as arrow
from typing import List, Literal
from loguru import logger
from smallpond.common import pytest_running
from smallpond.logical.dataset import ArrowTableDataSet
from smallpond.logical.node import Context, ConsolidateNode, DataSetPartitionNode, DataSourceNode, ArrowComputeNode, LogicalPlan, SqlEngineNode
from smallpond.execution.driver import Driver
from smallpond.execution.task import RuntimeContext, ArrowComputeTask
def solve_model(runtime_task: ArrowComputeTask,
                chain_table_type, num_nodes, group_size, min_targets_per_disk,
                init_timelimit, max_timelimit,
                pyomo_solver="appsi_highs"):
    """Build and solve one DataPlacementModel inside a smallpond compute task.

    Returns (model, solved_instance).  Solver artifacts are written under the
    task's runtime output directory, and elapsed time is reported through the
    task's profiling hook.
    """
    import logging
    # Silence pyomo's verbose INFO output inside worker tasks.
    pyomo_logger = logging.getLogger('pyomo')
    pyomo_logger.setLevel(logging.WARNING)
    # Import the model either as an installed package or relative to this file.
    try:
        from src.model.data_placement import DataPlacementModel
    except ImportError:  # fix: was a bare `except:`, which also hid unrelated errors
        sys.path.append(os.path.dirname(os.path.abspath(__file__)))
        from src.model.data_placement import DataPlacementModel
    model = DataPlacementModel(chain_table_type, num_nodes, group_size, min_targets_per_disk=min_targets_per_disk, bibd_only=False, qlinearize=True, relax_lb=1, relax_ub=0)
    runtime_task.add_elapsed_time("build model time")
    instance = model.run(
        pyomo_solver=pyomo_solver,
        threads=runtime_task.cpu_limit,
        init_timelimit=init_timelimit,
        max_timelimit=max_timelimit,
        auto_relax=True,
        output_root=runtime_task.runtime_output_abspath,
        add_elapsed_time=runtime_task.add_elapsed_time)
    return model, instance
def solve_loop(runtime_ctx: RuntimeContext, input_tables: List[arrow.Table],
               init_timelimit, max_timelimit,
               pyomo_solver="appsi_highs") -> arrow.Table:
    """Solve the placement model for each parameter row and return one combined
    Arrow table of solutions (or None when the input partition is empty)."""
    runtime_task = runtime_ctx.task
    model_params, = input_tables
    schema = arrow.schema([
        arrow.field("chain_table_type", arrow.string()),
        arrow.field("num_nodes", arrow.uint32()),
        arrow.field("group_size", arrow.uint32()),
        arrow.field("disks", arrow.list_(arrow.uint32())),
        arrow.field("groups", arrow.list_(arrow.uint32())),
    ])
    solution_tables = []
    # Each row of the partition describes one model to solve.
    for chain_table_type, num_nodes, group_size, min_targets_per_disk in zip(*model_params.to_pydict().values()):
        model, instance = solve_model(runtime_task, chain_table_type, num_nodes, group_size, min_targets_per_disk, init_timelimit, max_timelimit, pyomo_solver)
        incidence_matrix = model.get_incidence_matrix(instance)
        disks, groups = zip(*incidence_matrix.keys())
        solution_tables.append(arrow.Table.from_arrays([[chain_table_type], [num_nodes], [group_size], [disks], [groups]], schema=schema))
    if not solution_tables:
        return None
    return functools.reduce(lambda merged, table: arrow.concat_tables((merged, table)), solution_tables)
def search_data_placement_plans(
        chain_table_type: Literal["EC", "CR"],
        num_nodes: List[int], group_size: List[int], min_targets_per_disk=1,
        init_timelimit=1800, max_timelimit=3600*3,
        solver_threads: int=64,
        pyomo_solver="appsi_highs",
        num_executors=None):
    """Build a smallpond LogicalPlan that solves one placement model per
    (num_nodes, group_size) combination, one partition per parameter row.

    Fix: accept `num_executors` (currently unused here) so main() can forward
    `driver.num_executors` without raising TypeError; it is backward-compatible
    with existing callers that omit it.
    """
    # Only combinations with at least as many nodes as the group size are feasible.
    params = pd.DataFrame([(chain_table_type, v, k, min_targets_per_disk)
                           for v, k in itertools.product(num_nodes, group_size) if v >= k],
                          columns=["chain_table_type", "num_nodes", "group_size", "min_targets_per_disk"])
    logger.warning(f"params: {params}")
    ctx = Context()
    params_source = DataSourceNode(ctx, ArrowTableDataSet(arrow.Table.from_pandas(params)))
    # One partition per row so each model solves in its own task.
    params_partitions = DataSetPartitionNode(ctx, (params_source,), npartitions=len(params), partition_by_rows=True)
    data_placement_sols = ArrowComputeNode(
        ctx, (params_partitions,),
        process_func=functools.partial(solve_loop, init_timelimit=init_timelimit, max_timelimit=max_timelimit, pyomo_solver=pyomo_solver),
        cpu_limit=solver_threads)
    return LogicalPlan(ctx, data_placement_sols)
def main():
    """CLI entry point: build the placement-search plan and execute it on the smallpond driver."""
    driver = Driver()
    driver.add_argument("-pyomo", "--pyomo_solver", default="appsi_highs", choices=["appsi_highs", "cbc", "scip"], help="Solver used by Pyomo")
    driver.add_argument("-type", "--chain_table_type", type=str, required=True, choices=["EC", "CR"], help="CR - Chain Replication; EC - Erasure Coding")
    driver.add_argument("-v", "--num_nodes", nargs="+", type=int, required=True, help="Number of storage nodes")
    driver.add_argument("-k", "--replication_factor", "--group_size", dest="group_size", type=int, default=3, help="Replication factor or erasure coding group size")
    driver.add_argument("-min_r", "--min_targets_per_disk", type=int, default=1, help="Min number of storage targets on each disk")
    driver.add_argument("-j", "--solver_threads", type=int, default=32, help="Number of solver threads")
    driver.add_argument("-t", "--init_timelimit", type=int, default=1800, help="Initial timeout for solver")
    driver.add_argument("-T", "--max_timelimit", type=int, default=3600*3, help="Max timeout for solver")
    # NOTE(review): num_executors is forwarded here but search_data_placement_plans'
    # signature (above) does not declare such a parameter — verify it is accepted,
    # otherwise this call raises TypeError.
    plan = search_data_placement_plans(num_executors=driver.num_executors, **driver.get_arguments())
    driver.run(plan)
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,124 @@
import argparse
import os.path
from collections import Counter, defaultdict, namedtuple
import pickle
from typing import Dict, List, Literal, Tuple
# A physical storage target: its globally-unique id plus host node and disk slot.
Target = namedtuple("Target", ["target_id", "node_id", "disk_index"])
# A replication chain (CR) or EC group: chain id plus its member targets.
Chain = namedtuple("Chain", ["chain_id", "target_list"])
def calc_target_id(target_id_prefix: int, node_id: int, disk_index: int, target_index: int):
    """Encode (prefix, node, disk, target) into a single decimal-positional id.

    Layout: <prefix><node_id:6 digits><disk_index+1:3 digits><target_index+1:2 digits>.
    """
    with_node = target_id_prefix * 1_000_000 + node_id
    with_disk = with_node * 1_000 + (disk_index + 1)
    return with_disk * 100 + (target_index + 1)
def generate_chains(
    chain_table_type: Literal["EC", "CR"],
    node_id_begin: int,
    node_id_end: int,
    num_disks_per_node: int,
    num_targets_per_disk: int,
    target_id_prefix: int,
    chain_id_prefix: int,
    incidence_matrix: Dict[Tuple[int, int], bool],
    **kwargs):
    """Expand a single-disk incidence matrix into concrete chains for every disk.

    The same (node, group) layout is replicated across all disk indices.  For
    "CR", each group becomes one chain containing all its member targets; for
    "EC", every member slot of a group becomes its own single-target chain.
    Returns a list of Chain sorted by chain_id.  Extra **kwargs (unrelated CLI
    options passed via vars(args)) are ignored.
    """
    num_nodes = node_id_end - node_id_begin + 1
    # Sorted (node, group) pairs are visited in node-major order below;
    # assumes nodes in the matrix are numbered 1..num_nodes.
    nodes, groups = zip(*sorted(incidence_matrix.keys()))
    group_sizes = list(Counter(groups).values())
    assert max(nodes) == num_nodes, f"{max(nodes)=} != {num_nodes=}"
    assert all(s == group_sizes[0] for s in group_sizes[1:]), f"not all group sizes the same: {group_sizes}"
    assert len(incidence_matrix) % group_sizes[0] == 0, f"{len(incidence_matrix)=} % {group_sizes[0]=} != 0"
    assert len(incidence_matrix) == num_nodes * num_targets_per_disk, f"{len(incidence_matrix)=} != {num_nodes=} * {num_targets_per_disk=}"
    global_target_list = []
    chain_target_list = defaultdict(list)
    for disk_index in range(num_disks_per_node):
        # Per-disk counter of how many member slots of each group were assigned.
        group_slot_idx = defaultdict(int)
        for node_id in range(node_id_begin, node_id_end+1):
            for target_index in range(num_targets_per_disk):
                target_id = calc_target_id(target_id_prefix, node_id, disk_index, target_index)
                # Position of this target in the sorted (node, group) sequence.
                target_pos = (node_id - node_id_begin) * num_targets_per_disk + target_index
                if chain_table_type == "EC":
                    # EC: one chain per (group, member-slot) pair.
                    group_slot_idx[groups[target_pos]] += 1
                    chain_index = (groups[target_pos]-1) * group_sizes[0] + group_slot_idx[groups[target_pos]]
                else:
                    # CR: one chain per group.
                    chain_index = groups[target_pos]
                assert chain_index < 1_00_000, f"{chain_index} >= {1_00_000}"
                # Chain id layout: <prefix><disk_index+1:3 digits><chain_index:5 digits>.
                chain_id = (chain_id_prefix * 1_000 + (disk_index+1)) * 1_00_000 + chain_index
                target = Target(target_id, node_id, disk_index)
                global_target_list.append(target)
                chain_target_list[chain_id].append(target)
    # Sanity checks: unique targets, even distribution across nodes and disks.
    num_targets_on_node = list(Counter(target.node_id for target in global_target_list).values())
    num_targets_on_disk = list(Counter((target.node_id, target.disk_index) for target in global_target_list).values())
    assert len(global_target_list) == len(set(global_target_list)) == num_nodes * num_disks_per_node * num_targets_per_disk
    assert all(x == num_targets_on_node[0] for x in num_targets_on_node[1:])
    assert all(x == num_targets_on_disk[0] for x in num_targets_on_disk[1:])
    if chain_table_type == "EC":
        # EC chains each hold exactly one target.
        assert all(len(target_ids) == 1 for target_ids in chain_target_list.values())
        assert len(chain_target_list) == num_nodes * num_disks_per_node * num_targets_per_disk
    else:
        # CR chains each hold a full replication group.
        assert all(len(target_ids) == group_sizes[0] for target_ids in chain_target_list.values())
        assert len(chain_target_list) == num_nodes * num_disks_per_node * num_targets_per_disk // group_sizes[0]
    return [Chain(chain_id, target_list) for chain_id, target_list in sorted(chain_target_list.items())]
def main():
    """CLI entry point: expand a solved incidence matrix into chain tables and
    admin-CLI command files (create/remove targets)."""
    parser = argparse.ArgumentParser(prog="model.py", description="Generate 3FS create target commands")
    parser.add_argument("-type", "--chain_table_type", type=str, required=True, choices=["EC", "CR"], help="CR - Chain Replication; EC - Erasure Coding")
    parser.add_argument("-b", "--node_id_begin", type=int, required=True, help="The first node id")
    parser.add_argument("-e", "--node_id_end", type=int, required=True, help="The last node id")
    parser.add_argument("-d", "--num_disks_per_node", type=int, required=True, help="Number of disk on each storage node")
    parser.add_argument("-r", "--num_targets_per_disk", type=int, required=True, help="Number of storage targets on each disk")
    parser.add_argument("-tp", "--target_id_prefix", type=int, default=10, help="Prefix of generated target id")
    parser.add_argument("-cp", "--chain_id_prefix", type=int, default=10, help="Prefix of generated chain id")
    parser.add_argument("-cs", "--chunk_size", nargs="+", help="A list of supported file chunk sizes")
    parser.add_argument("-mat", "--incidence_matrix_path", type=str, required=True, help="Incidence matrix generated by data placement model")
    parser.add_argument("-o", "--output_path", default="output", help="Path of output files")
    args = parser.parse_args()
    with open(args.incidence_matrix_path, "rb") as fin:
        incidence_matrix = pickle.load(fin)
    # Bounds keep every component of the composed decimal ids within its digit budget.
    assert len(incidence_matrix) < 1_00_000
    assert args.node_id_end - args.node_id_begin < 1000
    assert args.node_id_end < 1_000_000
    assert args.node_id_begin < 1_000_000
    assert args.num_disks_per_node < 1000
    assert args.num_targets_per_disk < 100
    assert args.target_id_prefix < 100
    assert args.chain_id_prefix < 100
    chain_list = generate_chains(**vars(args), incidence_matrix=incidence_matrix)
    # generated_chains.csv: one row per chain listing all member target ids.
    with open(os.path.join(args.output_path, "generated_chains.csv"), "w") as fout:
        print(f"ChainId,{','.join(['TargetId']*len(chain_list[0].target_list))}", file=fout)
        for chain in chain_list:
            print(f"{chain.chain_id},{','.join(str(target.target_id) for target in chain.target_list)}", file=fout)
    # generated_chain_table.csv: flat list of chain ids.
    with open(os.path.join(args.output_path, "generated_chain_table.csv"), "w") as fout:
        print("ChainId", file=fout)
        for chain in chain_list:
            print(f"{chain.chain_id}", file=fout)
    # create_target_cmd.txt: admin-cli commands to create every target.
    with open(os.path.join(args.output_path, "create_target_cmd.txt"), "w") as fout:
        chunk_size_opt = f"--chunk-size {' '.join(args.chunk_size)}" if args.chunk_size else ""
        for chain in chain_list:
            for target in chain.target_list:
                print(f"create-target --node-id {target.node_id} --disk-index {target.disk_index} --target-id {target.target_id} --chain-id {chain.chain_id} {chunk_size_opt} --use-new-chunk-engine", file=fout)
    # remove_target_cmd.txt: commands to offline and then remove every target.
    with open(os.path.join(args.output_path, "remove_target_cmd.txt"), "w") as fout:
        for chain in chain_list:
            for target in chain.target_list:
                print(f"offline-target --node-id {target.node_id} --target-id {target.target_id}", file=fout)
                print(f"remove-target --node-id {target.node_id} --target-id {target.target_id}", file=fout)
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,94 @@
import copy
import glob
import os.path
import importlib
import shutil
import tempfile
import pytest
from src.model.data_placement import DataPlacementModel, RebalanceTrafficModel
# Parameter sets exercised by the parametrized placement-model tests below.
placement_params = [
    # simple cases for replication group
    {
        "chain_table_type": "EC",
        "num_nodes": 5,
        "num_targets_per_disk": 6,
        "group_size": 2,
    },
    {
        "chain_table_type": "EC",
        "num_nodes": 5,
        "num_targets_per_disk": 6,
        "group_size": 3,
    },
    # not all targets used: num_nodes * num_targets_per_disk % group_size != 0
    {
        "chain_table_type": "EC",
        "num_nodes": 7,
        "num_targets_per_disk": 5,
        "group_size": 4,
    },
    # always evenly distributed: num_targets_per_disk * (group_size-1) % (num_nodes-1) == 0
    {
        "chain_table_type": "EC",
        "num_nodes": 8,
        "num_targets_per_disk": 6,
        "group_size": 5,
    },
    # all targets used & evenly distributed
    {
        "chain_table_type": "EC",
        "num_nodes": 10,
        "num_targets_per_disk": 9,
        "group_size": 5,
    },
]
# Cross-product dimensions for the parametrized tests.
qlinearize = [False, True]
relax_lb = [1, 2]
# qlinearize[1:] restricts this test to the linearized form; solve() forces
# qlinearize on for HiGHS anyway, so the quadratic variant would be redundant.
@pytest.mark.parametrize('qlinearize', qlinearize[1:])
@pytest.mark.parametrize('relax_lb', relax_lb)
@pytest.mark.parametrize('placement_params', placement_params)
@pytest.mark.skipif(importlib.util.find_spec("highspy") is None, reason="cannot find solver")
def test_solve_placement_model_with_highs(placement_params, qlinearize, relax_lb):
    """Placement model should solve for every parameter set with the HiGHS solver."""
    DataPlacementModel(
        **placement_params,
        qlinearize=qlinearize,
        relax_lb=relax_lb,
    ).run(pyomo_solver="appsi_highs")
@pytest.mark.parametrize('chain_table_type, num_nodes, group_size', [("CR", 25, 3), ("EC", 25, 20)])
@pytest.mark.skipif(importlib.util.find_spec("highspy") is None, reason="cannot find solver")
def test_solve_placement_model_v25(chain_table_type, num_nodes, group_size):
    """Larger 25-node instance: a short time limit plus auto_relax must still yield a solution."""
    model = DataPlacementModel(
        chain_table_type=chain_table_type,
        num_nodes=num_nodes,
        group_size=group_size,
        qlinearize=True,
        relax_lb=1,
        relax_ub=1,
    )
    model.run(pyomo_solver="appsi_highs", max_timelimit=30, auto_relax=True)
@pytest.mark.parametrize('placement_params', placement_params)
@pytest.mark.skipif(importlib.util.find_spec("highspy") is None, reason="cannot find solver")
def test_solve_rebalance_model(placement_params):
    """Solve a placement model, then rebalance its layout onto twice as many nodes."""
    model = DataPlacementModel(
        **placement_params,
        qlinearize=True,
        relax_lb=1,
        relax_ub=1,
    )
    instance = model.run(pyomo_solver="appsi_highs")
    # Expand to double the nodes; let the model pick num_targets_per_disk itself.
    rebalance_params = copy.deepcopy(placement_params)
    rebalance_params["num_nodes"] *= 2
    rebalance_params.pop("num_targets_per_disk")
    RebalanceTrafficModel(
        existing_incidence_matrix=model.get_incidence_matrix(instance),
        **rebalance_params,
        qlinearize=True,
        relax_lb=2,
        relax_ub=1,
    ).run(pyomo_solver="appsi_highs", max_timelimit=15, auto_relax=True)

View File

@@ -0,0 +1,10 @@
from smallpond.test_fabric import TestFabric
from src.model.data_placement_job import search_data_placement_plans
class TestPlan(TestFabric):
    """End-to-end check that a data-placement search plan builds and executes."""
    def test_search_data_placement_plans(self):
        solvers = ["appsi_highs"]
        for solver_name in solvers:
            with self.subTest(pyomo_solver=solver_name):
                plan = search_data_placement_plans(chain_table_type="EC", num_nodes=[10], group_size=[5, 9], solver_threads=16, pyomo_solver=solver_name)
                self.execute_plan(plan, num_executors=1)

View File

@@ -0,0 +1,55 @@
from collections import Counter
import glob
import os.path
import pytest
from src.model.data_placement import DataPlacementModel
from src.setup.gen_chain_table import generate_chains
@pytest.mark.parametrize('num_nodes, num_disks_per_node, num_targets_per_disk, num_replicas', [(5, 10, 6, 2), (10, 10, 9, 3)])
def test_generate_cr_chains(num_nodes: int, num_disks_per_node: int, num_targets_per_disk: int, num_replicas: int):
    """Solve a CR placement model and materialize its chain table."""
    placement = DataPlacementModel(
        chain_table_type="CR",
        num_nodes=num_nodes,
        num_targets_per_disk=num_targets_per_disk,
        group_size=num_replicas,
        qlinearize=True,
        relax_lb=1,
        relax_ub=1,
    )
    solved = placement.run(pyomo_solver="appsi_highs", max_timelimit=15, auto_relax=True)
    generate_chains(
        chain_table_type="CR",
        node_id_begin=1,
        node_id_end=num_nodes,
        num_disks_per_node=num_disks_per_node,
        num_targets_per_disk=num_targets_per_disk,
        target_id_prefix=1,
        chain_id_prefix=9,
        incidence_matrix=placement.get_incidence_matrix(solved),
    )
@pytest.mark.parametrize('num_nodes, num_disks_per_node, num_targets_per_disk, ec_group_size', [(20, 10, 6, 12), (25, 10, 12, 20)])
def test_generate_ec_chains(num_nodes: int, num_disks_per_node: int, num_targets_per_disk: int, ec_group_size: int):
    """Solve an EC placement model and materialize its chain table."""
    placement = DataPlacementModel(
        chain_table_type="EC",
        num_nodes=num_nodes,
        num_targets_per_disk=num_targets_per_disk,
        group_size=ec_group_size,
        qlinearize=True,
        relax_lb=1,
        relax_ub=1,
    )
    solved = placement.run(pyomo_solver="appsi_highs", max_timelimit=15, auto_relax=True)
    generate_chains(
        chain_table_type="EC",
        node_id_begin=1,
        node_id_end=num_nodes,
        num_disks_per_node=num_disks_per_node,
        num_targets_per_disk=num_targets_per_disk,
        target_id_prefix=1,
        chain_id_prefix=9,
        incidence_matrix=placement.get_incidence_matrix(solved),
    )

View File

@@ -0,0 +1,51 @@
-- ClickHouse schema for 3FS monitoring data.
-- NOTE(review): the identifier `3fs` starts with a digit and is unquoted;
-- confirm the target ClickHouse version accepts it without backquotes.
CREATE DATABASE IF NOT EXISTS 3fs;
-- Counter-style metrics: one Int64 sample per (metric, host, tag, ...) row.
-- Partitioned by day, ordered for metric/host/pod lookups, retained 1 month.
CREATE TABLE IF NOT EXISTS 3fs.counters (
`TIMESTAMP` DateTime CODEC(DoubleDelta),
`metricName` LowCardinality(String) CODEC(ZSTD(1)),
`host` LowCardinality(String) CODEC(ZSTD(1)),
`tag` LowCardinality(String) CODEC(ZSTD(1)),
`val` Int64 CODEC(ZSTD(1)),
`mount_name` LowCardinality(String) CODEC(ZSTD(1)),
`instance` String CODEC(ZSTD(1)),
`io` LowCardinality(String) CODEC(ZSTD(1)),
`uid` LowCardinality(String) CODEC(ZSTD(1)),
`pod` String CODEC(ZSTD(1)),
`thread` LowCardinality(String) CODEC(ZSTD(1)),
`statusCode` LowCardinality(String) CODEC(ZSTD(1))
)
ENGINE = MergeTree
PRIMARY KEY (metricName, host, pod, instance, TIMESTAMP)
PARTITION BY toDate(TIMESTAMP)
ORDER BY (metricName, host, pod, instance, TIMESTAMP)
-- Drop rows one month after their TIMESTAMP.
TTL TIMESTAMP + toIntervalMonth(1)
SETTINGS index_granularity = 8192;
-- Distribution-style metrics: pre-aggregated latency/size statistics
-- (count/mean/min/max and p50/p90/p95/p99) per metric sample window.
-- Same partitioning, ordering, and 1-month retention as 3fs.counters.
CREATE TABLE IF NOT EXISTS 3fs.distributions (
`TIMESTAMP` DateTime CODEC(DoubleDelta),
`metricName` LowCardinality(String) CODEC(ZSTD(1)),
`host` LowCardinality(String) CODEC(ZSTD(1)),
`tag` LowCardinality(String) CODEC(ZSTD(1)),
`count` Float64 CODEC(ZSTD(1)),
`mean` Float64 CODEC(ZSTD(1)),
`min` Float64 CODEC(ZSTD(1)),
`max` Float64 CODEC(ZSTD(1)),
`p50` Float64 CODEC(ZSTD(1)),
`p90` Float64 CODEC(ZSTD(1)),
`p95` Float64 CODEC(ZSTD(1)),
`p99` Float64 CODEC(ZSTD(1)),
`mount_name` LowCardinality(String) CODEC(ZSTD(1)),
`instance` String CODEC(ZSTD(1)),
`io` LowCardinality(String) CODEC(ZSTD(1)),
`uid` LowCardinality(String) CODEC(ZSTD(1)),
`method` LowCardinality(String) CODEC(ZSTD(1)),
`pod` String CODEC(ZSTD(1)),
`thread` LowCardinality(String) CODEC(ZSTD(1)),
`statusCode` LowCardinality(String) CODEC(ZSTD(1))
)
ENGINE = MergeTree
PRIMARY KEY (metricName, host, pod, instance, TIMESTAMP)
PARTITION BY toDate(TIMESTAMP)
ORDER BY (metricName, host, pod, instance, TIMESTAMP)
TTL TIMESTAMP + toIntervalMonth(1)
SETTINGS index_granularity = 8192;

View File

@@ -0,0 +1,12 @@
[Unit]
Description=fuse_main Server
# Only start once the network is up (the launcher config points at remote services).
Requires=network-online.target
After=network-online.target
[Service]
# FUSE daemon serves many concurrent file handles; raise the fd limit.
LimitNOFILE=1000000
ExecStart=/opt/3fs/bin/hf3fs_fuse_main --launcher_cfg /opt/3fs/etc/hf3fs_fuse_main_launcher.toml
Type=simple
# NOTE(review): no Restart= directive — the daemon will not restart on crash;
# confirm this is intentional.
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,12 @@
[Unit]
Description=meta_main Server
# Only start once the network is up (metadata service talks to mgmtd/FoundationDB).
Requires=network-online.target
After=network-online.target
[Service]
# Metadata service handles many concurrent connections; raise the fd limit.
LimitNOFILE=1000000
ExecStart=/opt/3fs/bin/meta_main --launcher_cfg /opt/3fs/etc/meta_main_launcher.toml --app-cfg /opt/3fs/etc/meta_main_app.toml
Type=simple
# NOTE(review): no Restart= directive — the service will not restart on crash;
# confirm this is intentional.
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,12 @@
[Unit]
Description=mgmtd_main Server
# Only start once the network is up (cluster manager serves heartbeats over the network).
Requires=network-online.target
After=network-online.target
[Service]
# Cluster manager maintains connections to all services; raise the fd limit.
LimitNOFILE=1000000
ExecStart=/opt/3fs/bin/mgmtd_main --launcher_cfg /opt/3fs/etc/mgmtd_main_launcher.toml --app-cfg /opt/3fs/etc/mgmtd_main_app.toml
Type=simple
# NOTE(review): no Restart= directive — the service will not restart on crash;
# confirm this is intentional.
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,11 @@
[Unit]
Description=monitor_collector_main Server
# Only start once the network is up (collector writes metrics to ClickHouse).
Requires=network-online.target
After=network-online.target
[Service]
ExecStart=/opt/3fs/bin/monitor_collector_main --cfg /opt/3fs/etc/monitor_collector_main.toml
Type=simple
# NOTE(review): no Restart= and no raised LimitNOFILE, unlike the other 3FS
# units — confirm the defaults are sufficient for the collector.
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,14 @@
[Unit]
Description=storage_main Server
# Only start once the network is up (storage service registers with mgmtd).
Requires=network-online.target
After=network-online.target
[Service]
# Storage service keeps many chunk files and connections open; raise the fd limit.
LimitNOFILE=1000000
# Unlimited locked memory — presumably for RDMA memory registration; confirm.
LimitMEMLOCK=infinity
# Allow a graceful shutdown window (flush/handover) before SIGKILL.
TimeoutStopSec=5m
ExecStart=/opt/3fs/bin/storage_main --launcher_cfg /opt/3fs/etc/storage_main_launcher.toml --app-cfg /opt/3fs/etc/storage_main_app.toml
Type=simple
# NOTE(review): no Restart= directive — the service will not restart on crash;
# confirm this is intentional.
[Install]
WantedBy=multi-user.target

6
docs/README.md Normal file
View File

@@ -0,0 +1,6 @@
# Documentation
* [Design Notes](design_notes.md)
* [Setup Guide](../deploy/README.md)
* [USRBIO API Reference](../src/lib/api/UsrbIo.md)
* [P Specifications](../specs/README.md)

290
docs/design_notes.md Normal file
View File

@@ -0,0 +1,290 @@
# Design Notes
## Design and implementation
The 3FS system has four components: cluster manager, metadata service, storage service and client. All components are connected in a RDMA network (InfiniBand or RoCE).
Metadata and storage services send heartbeats to cluster manager. Cluster manager handles membership changes and distributes cluster configuration to other services and clients. Multiple cluster managers are deployed and one of them is elected as the primary. Another manager is promoted as primary when the primary fails. Cluster configuration is typically stored in a reliable distributed coordination service, such as ZooKeeper or etcd. In our production environment, we use the same key-value store as file metadata to reduce dependencies.
File metadata operations (e.g. open or create files/directories) are sent to metadata services, which implement the file system semantics. Metadata services are stateless, since file metadata are stored in a transactional key-value store (e.g. FoundationDB). Clients can connect to any metadata service.
Each storage service manages a few local SSDs and provides a chunk store interface. The storage service implements Chain Replication with Apportioned Queries (CRAQ) to ensure strong consistency. CRAQs write-all-read-any approach helps to unleash the throughput of SSDs and RDMA network. A 3FS file is split into equally sized chunks, which are replicated over multiple SSDs.
Two clients are developed for applications: FUSE client and native client. Most applications use FUSE client, which has a low adoption barrier. Performance-critical applications are integrated with the native client.
## File system interfaces
Object store is becoming a popular option for data analytics and machine learning. However, file system semantics and a unified namespace where files are organized in directories provide greater flexibility for applications.
- *Atomic directory manipulation* An object store can approximate hierarchical directory structures by using slashes (/) in object keys. However, it doesnt natively support operations like atomically moving files/directories, or recursively deleting entire directories. Actually a common pattern in our internal applications involves creating a temporary directory, writing files to it, and then moving the directory to its final location. When handling a large number of small files, the recursive delete for directories is crucial. Without it, applications have to traverse each directory and remove files one by one.
- *Symbolic and hard links* Our applications utilize symbolic and hard links to create lightweight snapshots of dynamically updated datasets, where new data is appended as individual files.
- *Familiar interface* The file interface is well known and used everywhere. There is no need to learn a new storage API. Many datasets are stored as CSV/Parquet files. Adapting file-based data loaders to use the 3FS FUSE client or native client is straightforward.
### Limitations of FUSE
FUSE (Filesystem in Userspace) simplifies file system client development by redirecting I/O operations to user-space processes through the FUSE kernel module. It creates the illusion that applications are accessing the remote file system as if it were a local file system. However, it has performance limitations:
- *Memory copy overhead* The user-space file system daemon cannot access application memory. Data transfer between kernel and user spaces consumes memory bandwidth and increases end-to-end latency.
- *Primitive multi-threading support* When an application initiates I/O requests, FUSE places these requests into a multi-threaded shared queue, protected by a spin lock. The user-space file system daemon then retrieves and processes requests from this queue. Due to lock contention, FUSEs I/O processing capability fails to scale with the number of threads. Our benchmark results indicate that FUSE only handles approximately 400K 4KiB reads per second. Further increasing concurrency does not improve performance as lock contention intensifies. `perf` profiling reveals that the kernel-space spin lock consumes a significant amount of CPU time.
Most applications, e.g. data analytics, perform large block writes on 3FS or they can buffer data in memory and flush it to 3FS when write buffer is full. However, FUSE on Linux 5.x does not support concurrent writes to the same file[^1]. Applications overcome this limitation by writing to multiple files concurrently, maximizing the total throughput.
Read operations exhibit more complex patterns. Some training jobs require random access to dataset samples, with read sizes varying from a few kilobytes to several megabytes per sample. And samples are typically not 4K-aligned in files. Data loaders are specifically designed to fetch batches of samples. But they perform poorly when handling small random reads on FUSE-mounted 3FS. The bandwidth of SSDs and the RDMA network is not fully utilized.
### Asynchronous zero-copy API
Implementing the file system client as a VFS kernel module avoids performance issues mentioned above. But kernel module development is significantly more challenging than user-space system programming. Bugs are difficult to diagnose and can lead to catastrophic failures in production environments. For example, machines may crash and leave no log message for debugging. When upgrading a kernel module, all processes using the file system must be stopped cleanly; otherwise, a machine restart is required.
For these reasons, we have chosen to implement a native client within the FUSE daemon. This client offers an interface that supports asynchronous zero-copy I/O operations. File meta operations are still handled by FUSE daemon (e.g. open/close/stat files). Applications call `open()` to obtain a file descriptor (fd) and register it via native API. They can then perform I/O operations on the file with native client. This approach ensures consistency in metadata operations with the POSIX API, making it easier to migrate existing code.
The asynchronous, zero-copy API is inspired by Linux `io_uring`. Below are the key data structures in the API:
- *Iov* A large memory region for zero-copy read/write operations, shared between the user process and the native client. InfiniBand memory registration is managed by the client. In native API, all read data will be read into Iov, and all write data should be written to Iov before calling the API.
- *Ior* A small shared ring buffer for communication between user process and native client. The usage of Ior is similar to Linux `io_uring`, where the user process enqueues read/write requests, and the native client dequeues these requests for completion. The requests are executed in batches, with their sizes controlled by the `io_depth` parameter. Multiple batches are processed in parallel, whether from different rings or the same ring. However, multiple rings are still recommended for multi-threaded applications, as sharing a ring requires synchronization, which can impact performance.
Within the native client, multiple threads are spawned to fetch I/O requests from the Iors. These requests are batched and dispatched to storage services, reducing RPC overhead caused by small read requests.
## File metadata store
### Location of file chunks
3FS divides file data into equally sized chunks and stripes them across multiple replication chains (replication chains and chain tables are defined in Section [Data placement](#data-placement)). Users can specify the chain table, chunk size, and stripe size for files on a per-directory basis. Each chunk is independently stored on multiple storage services, with its chunk ID generated by concatenating the file's inode id and chunk index.
When creating a new file, the metadata service employs a round-robin strategy to select consecutive replication chains from the designated chain table, based on the stripe size. Next, a random seed is generated to shuffle the selected chains. This allocation strategy ensures balanced data distribution across chains and SSDs.
When an application opens a file, the client contacts the meta service to obtain the files data layout information. Then the client can independently compute chunk IDs and chains for data operations, minimizing the involvement of the meta service in the critical path.
### File metadata on transactional key-value store
3FS uses FoundationDB as its distributed storage system for metadata. FoundationDB provides a key-value store interface and supports transactions with Serializable Snapshot Isolation (SSI). 3FS stores all metadata as key-value pairs in FoundationDB. Meta services follow a stateless architecture, greatly enhancing maintainability by allowing administrators to seamlessly upgrade or restart services without disruption. When clients experience request failures or timeouts, they can automatically fail over to other available services.
The file system metadata primarily consists of two core structures: inodes and directory entries. Inodes store attribute information for files, directories, and symbolic links, each identified by a globally unique 64-bit identifier that increments monotonically. Inode keys are constructed by concatenating the "INOD" prefix with the inode id, which is encoded in little-endian byte order to spread inodes over multiple FoundationDB nodes. The inode values vary by its type:
- All inode types contain basic attributes: ownership, permissions, access/modification/change times.
- Additional attributes for file inodes: file length, chunk size, selected range in chain table, shuffle seed.
- Additional attributes for directory inodes: the parent directorys inode id, default layout configurations for subdirectories/files (chain table, chunk size, stripe size). The parents inode id is required to detect loops when moving directories. When moving `dir_a/dir_b` to `dir_c/`, we need to ensure that `dir_c` is not a descendant of `dir_b`, which can be achieved by checking all ancestors of `dir_c` upward.
- Additional attributes for symbolic link inodes: target path string.
Directory entry keys are composed of a "DENT" prefix, the parent inode ID, and the entry name. Directory entry values store the target inode id and inode type. All entries within a directory naturally form a contiguous key range, allowing efficient directory listing via range queries.
The meta operations leverages FoundationDBs transactions:
- Read-only transactions used for metadata queries: fstat, lookup, listdir etc.
- Read-write transactions used for metadata updates: create, link, unlink, rename etc.
For write transactions, FoundationDB tracks the read/write key sets to form conflict detection sets. When concurrent transaction conflicts are detected, the meta service automatically retries the transaction. This design enables multiple meta services to process requests in parallel while maintaining file system metadata consistency.
### Dynamic file attributes
On most local file systems, deleting an opened file is deferred until all associated file descriptors are closed. Consequently, it is necessary to track all file descriptors of the file. Training jobs open a large number of files during startup. Storing all file descriptors would impose heavy load on meta service and FoundationDB. Since training jobs do not depend on this feature, 3FS does not track file descriptors opened in read-only mode.
3FS maintains a file session for each file descriptor (fd) opened in write mode since deleting write opened files may lead to unreclaimable garbage chunks from concurrent writes. When a file with active write sessions is deleted, meta service delays the deletion until all its fds are closed. To prevent lingering sessions from offline clients, the 3FS meta service periodically checks client liveness and cleans up sessions of offline clients.
The file length is stored in the inode. For files being actively updated, the length stored in inode may diverge from the actual length. Clients periodically (5 seconds by default) report to meta service maximum write position of each file opened in write mode. If this position exceeds the length in inode and there is no concurrent truncate operation, this position is adopted as the new file length.
Due to the possibility of concurrent writes from multiple clients, the method described above ensures only eventual consistency for file lengths. When processing close/fsync operations, the meta service obtains the precise file length by querying the ID and length of the last chunk from the storage service. Since file data is striped across multiple chains, this operation incurs non-negligible overhead.
Concurrent updates to the same files length by multiple meta services may cause transaction conflicts and lead to repeated file length computation. To mitigate this, meta service distributes file length update tasks across multiple meta services using inode IDs and the rendezvous hash algorithm.
Our production environments use a large stripe size: 200. For small files, the number of chains containing file chunks is well below this number. The number of potentially used chains is stored in file inode and used as a hint when updating the length. It starts with an initial value of 16 and is doubled each time additional file chunks are written to more chains. This allows us to avoid querying all 200 chains when updating lengths of small files. This optimization can also be extended to the deletion of small files.
## Chunk storage system
The design goal of chunk storage system is to achieve the highest bandwidth possible even when there are storage medium failures. The read/write throughput of 3FS should scale linearly with the number of SSDs and bisection network bandwidth between clients and storage services. Applications access storage services in a locality-oblivious manner.
### Data placement
Each file chunk is replicated over a chain of storage targets using chain replication with apportioned queries (CRAQ). In CRAQ write requests are sent to the head target and propagated along a chain. Read requests can be sent to any of the storage target. Usually the read traffic is evenly distributed among all targets in a chain for better load balance. Multiple storage targets are created on each SSD and the targets join different chains.
Suppose there are 6 nodes: A, B, C, D, E, F. Each node has 1 SSD. Create 5 storage targets on each SSD: 1, 2, ... 5. Then there are 30 targets in total: A1, A2, A3, ..., F5. If each chunk has 3 replicas, a chain table is constructed as follows.
| Chain | Version | Target 1 (head) | Target 2 | Target 3 (tail) |
| :---: | :-----: | :-------------: | :------: | :-------------: |
| 1 | 1 | `A1` | `B1` | `C1` |
| 2 | 1 | `D1` | `E1` | `F1` |
| 3 | 1 | `A2` | `B2` | `C2` |
| 4 | 1 | `D2` | `E2` | `F2` |
| 5 | 1 | `A3` | `B3` | `C3` |
| 6 | 1 | `D3` | `E3` | `F3` |
| 7 | 1 | `A4` | `B4` | `C4` |
| 8 | 1 | `D4` | `E4` | `F4` |
| 9 | 1 | `A5` | `B5` | `C5` |
| 10 | 1 | `D5` | `E5` | `F5` |
Each chain has a version number. The version number is incremented if the chain is changed (e.g. a storage target is offline). Only the primary cluster manager makes changes to chain tables.
A few chain tables can be constructed to support different data placement requirements. For example, two chain tables can be created, one for batch/offline jobs and another for online services. The two tables consist of storage targets on mutually exclusive nodes and SSDs.
Logically, the state of each chain changes independently. Each chain can be included in multiple chain tables. The concept of chain table is created to let metadata service pick a table for each file and stripe file chunks across chains in the table.
### Balanced traffic during recovery
Suppose read traffic is evenly distributed among all storage targets in the above chain table. When A fails its read requests would be redirected to B and C. Under heavy load the read bandwidth of B, C is immediately saturated and B, C become the bottleneck of the entire system. Replacing a failed SSD and syncing data to the new SSD can take several hours. The read throughput is impaired during this period.
To reduce the performance impact, we can have more SSDs share the redirected traffic. In the following chain table, A is paired with every other SSDs. When A fails, each of the other SSDs receives 1/5 of As read traffic.
| Chain | Version | Target 1 (head) | Target 2 | Target 3 (tail) |
| :---: | :-----: | :-------------: | :------: | :-------------: |
| 1 | 1 | `B1` | `E1` | `F1` |
| 2 | 1 | `A1` | `B2` | `D1` |
| 3 | 1 | `A2` | `D2` | `F2` |
| 4 | 1 | `C1` | `D3` | `E2` |
| 5 | 1 | `A3` | `C2` | `F3` |
| 6 | 1 | `A4` | `B3` | `E3` |
| 7 | 1 | `B4` | `C3` | `F4` |
| 8 | 1 | `B5` | `C4` | `E4` |
| 9 | 1 | `A5` | `C5` | `D4` |
| 10 | 1 | `D5` | `E5` | `F5` |
To achieve maximum read throughput during recovery, the load balance problem can be formulated as a balanced incomplete block design. The optimal solution is obtained by using integer programming solver.
### Data replication
CRAQ is a write-all-read-any replication protocol optimized for read-heavy workloads. Utilizing read bandwidth of all replicas is critical to achieve highest read throughput in an all-flash storage system.
When a write request is received by a storage service, it goes through the following steps:
1. The service checks if the chain version in write request matches with the latest known version; reject the request if its not. The write request could be sent by a client or a predecessor in the chain.
2. The service issues RDMA Read operations to pull write data. If the client/predecessor fails, the RDMA Read operations may time out and the write is aborted.
3. Once the write data is fetched into local memory buffer, a lock for the chunk to be updated is acquired from a lock manager. Concurrent writes to the same chunk are blocked. All writes are serialized at the head target.
4. The service reads the committed version of the chunk into memory, applies the update, and stores the updated chunk as a pending version. A storage target may store two versions of a chunk: a committed version and a pending version. Each version has a monotonically-increasing version number. The version numbers of committed version and pending versions are `v` and `u` respectively, and satisfy `u = v + 1`.
5. If the service is the tail, the committed version is atomically replaced by the pending version and an acknowledgment message is sent to the predecessor. Otherwise, the write request is forwarded to the successor. When the committed version is updated, the current chain version is stored as a field in the chunk metadata.
6. When an acknowledgment message arrives at a storage service, the service replaces the committed version with the pending version and continues to propagate the message to its predecessor. The local chunk lock is then released.
Suppose there are 3 targets in the chain: `A, B, C`. A write request has just entered step 5 at `A`. `A` forwards the request to successor `B`. Then `B` instantly fails and the forwarded write request is lost. When cluster manager detects `B`s failure, it marks `B` as offline and moves it to the end of chain and broadcasts the updated chain table. Once `A` receives the latest chain table, it forwards the write request to the new successor `C`. `C` may not receive the latest chain table yet and rejects the request. But `A` can keep forwarding the request to `C`. Eventually `C` gets the latest chain table and accepts the request.
When a read request arrives at a storage service:
1. When the service only has a committed version of the chunk, this version is returned to the client.
2. Unlike CRAQ, our implementation does not issue version query to the tail target. When there are both committed and pending versions, the service replies a special status code to notify the client. The client may wait for a short interval and retry. Or the client can issue a relaxed read request to get the pending version.
### Failure detection
The cluster manager relies on heartbeats to detect fail-stop failures. Cluster manager declares a service failed if it does not receive heartbeats from it for a configurable interval (e.g. T seconds). A service stops processing requests and exits if it cannot communicate with cluster manager for T/2 seconds. The heartbeat can be seen as a request to \*renew a lease\* granted by the manager.
The metadata services are stateless. The list of online meta services provided by cluster manager is a simple service discovery mechanism that helps clients create connections to metadata services. If one meta service is down, the clients may switch to any other metadata service.
Cluster manager plays a more critical role in membership changes of storage services. It maintains a global view of chain tables and storage targets states. Each storage target has a public state and a local state.
Public state indicates if its ready to serve read requests and if write requests would be propagated to it. Public states are stored with chain tables and distributed to services and clients.
| Public State | Read | Write | Notes |
| :----------- | :--: | :---: | :---------------------------------------------- |
| serving | Y | Y | service alive and serving client requests |
| syncing | N | Y | service alive and data recovery is in progress |
| waiting | N | N | service alive and data recovery not started yet |
| lastsrv | N | N | service down and it was the last serving target |
| offline | N | N | service down or storage medium failure |
Local state is only known by storage services and cluster manager, and its stored in the memory of cluster manager. If a storage target has medium failure, the related service sets the targets local state to offline in heartbeat. If a storage service is down, storage targets managed by the service are marked offline.
| Local State | Notes |
| :---------- | :--------------------------------------------------- |
| up-to-date | service alive and serving client requests |
| online | service alive and target in syncing or waiting state |
| offline | service down or storage medium failure |
A storage target can change from one public state to another in response to the latest local state. The local state plays the role of a triggering event. The cluster manager periodically scans every chain and updates the public states of targets on the chain according to a state-transition table.
- The chain version is incremented if the chain is updated.
- If a storage target is marked offline, its moved to the end of chain.
- If a storage service finds public state of any local storage target is lastsrv or offline, it exits immediately. The service may be isolated from the cluster manager by network partition error.
- Once the data recovery of a storage target in syncing state is completed, the storage service sets the target's local state to up-to-date in subsequent heartbeat messages sent to cluster manager.
| Local State | Current Public State | Predecessors Public State | Next Public State |
| :---------- | :------------------- | :------------------------- | :---------------- |
| up-to-date | serving | (any) | serving |
| | syncing | (any) | serving |
| | waiting | (any) | waiting |
| | lastsrv | (any) | serving |
| | offline | (any) | waiting |
| online | serving | (any) | serving |
| | syncing | serving | syncing |
| | | not serving | waiting |
| | waiting | serving | syncing |
| | | not serving | waiting |
| | lastsrv | (any) | serving |
| | offline | (any) | waiting |
| offline | serving | has no predecessor | lastsrv |
| | | has predecessor | offline |
| | syncing | (any) | offline |
| | waiting | (any) | offline |
| | lastsrv | (any) | lastsrv |
| | offline | (any) | offline |
### Data recovery
When a storage service exits (e.g. process crashes or restarts during upgrade), or a storage medium failure occurs, all related storage targets will be marked as offline and moved to the end of chains by cluster manager. Once the service restarts, each target on the service enters into the recovery process independently. The entire recovery process overlaps with normal activity and minimizes any interruption.
When a previously offline storage service starts:
1. The service periodically pulls latest chain tables from cluster manager. But it does not send heartbeats until all its storage targets have been marked offline in the latest chain tables. This ensures all its targets would go through the data recovery process.
2. When a write request arrives during recovery, the request is always a full-chunk-replace write. The local committed version is updated and any existing pending version is abandoned. Since current service is the tail, an acknowledgment message is sent to the predecessor. The full state of the predecessor is copied to the returning service through a continuous stream of full-chunk-replace writes.
3. Before the data recovery of a storage target starts, the predecessor sends a dump-chunkmeta request to the returning service. Then the service iterates the local chunk metadata store to collect the ids, chain versions and committed/pending version numbers of all chunks on the target, and replies the collected metadata to the predecessor.
4. When a sync-done message arrives, the service knows that the storage target is up-to-date. It sets local state of the target to up-to-date in heartbeat messages sent to cluster manager.
When a storage service finds a previously offline successor is online:
1. The service starts to forward normal write requests to the successor. Clients may only update a portion of the chunk, but the forwarded write requests should contain the whole chunk, i.e. a full-chunk-replace write.
2. The service sends a dump-chunkmeta request to the successor. Once the metadata of all chunks on the successor target are received, it collects the chunk metadata on its local target. Then it compares the two copies of chunk metadata to decide which chunks should be transferred.
3. The selected chunks are transferred to the successor by issuing full-chunk-replace write requests.
- The chunk lock is first acquired for each chunk.
- The chain version, committed version number and chunk content are read and transferred to successor by sending a full-chunk-replace request.
- The chunk lock is released.
4\. When all required chunks have been transferred, a sync-done message is sent to the successor.
The rules used to decide which chunks should be transferred are:
- If a chunk only exists on the local target, it should be transferred.
- If a chunk only exists on the remote target, it should be removed.
- If the chain version of local chunk replica is greater than that of the remote chunk replica, it should be transferred.
- If the chain versions of local/remote chunk replicas are the same but local committed version number does not equal to the remote pending version number, it should be transferred.
- Otherwise, two chunk replicas are either the same or being updated by in-progress write requests.
### Chunks and the metadata
File chunks are stored in the chunk engine. On each SSD, the persistent storage of the chunk engine consists of a fixed number of data files for storing chunk data, and a RocksDB instance for maintaining chunk metadata and other system information. Additionally, the chunk engine maintains an in-memory cache of chunk metadata to enhance query performance. A chunk allocator is implemented for fast allocation of new chunks. The chunk engine interface provides thread-safe access through the following operations:
1. *open/close* Initializes the engine by loading metadata from RocksDB and reconstructing chunk allocator states.
2. *get* Retrieves chunk metadata and reference-counted handle through a hashmap cache, enabling concurrent access with O(1) average complexity.
3. *update* Implements copy-on-write (COW) semantics by allocating new chunks before modifying data. Old chunks remain readable until all handles are released.
4. *commit* Commit the updated chunk metadata to RocksDB via write batches to ensure atomic updates; synchronously refresh the chunk metadata cache.
The chunk data will ultimately be stored on physical blocks. Physical block sizes range from 64KiB to 64MiB in increments of powers of two, totaling 11 distinct sizes. The allocator will assign physical blocks whose sizes most closely match the actual chunk size. A resource pool is constructed for each physical block size, with each pool containing 256 physical files. The usage status of physical blocks is maintained in memory using bitmaps. When a physical block is reclaimed, its bitmap flag is set to 0. The actual storage space of the block remains preserved and will be prioritized for subsequent allocations. When no available physical blocks remain, `fallocate()` will be used to allocate a contiguous large space in physical files, creating 256 new physical blocks - this approach helps reduce disk fragmentation.
When performing write operations on a chunk, the allocator first assigns a new physical block. The system then reads existing chunk data into a buffer, applies the update, and writes the updated buffer to the newly allocated block. An optimized process is implemented for appends, where data is directly added in-place at the end of the existing block. A new copy of metadata is constructed from the new block's location and existing chunk metadata. Subsequently, both the new chunk metadata and statuses of new and old physical blocks are atomically updated in RocksDB.
[^1]: https://elixir.bootlin.com/linux/v5.4.284/source/fs/fuse/file.c#L1573

Binary file not shown.

After

Width:  |  Height:  |  Size: 523 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 658 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 341 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.6 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 257 KiB

310
hf3fs/__init__.py Normal file
View File

@@ -0,0 +1,310 @@
from hf3fs_py_usrbio import Client, iovec
#import attrs
from contextlib import contextmanager, AbstractContextManager
from dataclasses import dataclass
import functools
import os
from pathlib import PurePosixPath
import threading as th
from pkg_resources import get_distribution
try:
__version__ = get_distribution('hf3fs').version
except:
__version__ = "debug"
DEFAULT_CLIENT = th.local()
DEFAULT_CLIENT.client = None
DEFAULT_CLIENT.clients = {}
#@attrs.define
@dataclass
class MountInfo:
    # Per-mount connection info registered via setMountInfo().
    token: str      # authentication token passed to Client()
    as_super: bool  # whether to connect with super-user privileges

# Registry of mount name -> MountInfo, populated by setMountInfo().
MOUNT_INFO = {}
def _getDefaultClient(kwargs):
if 'client' in kwargs and kwargs['client'] is not None:
client = kwargs['client']
del kwargs['client']
elif 'mount_name' in kwargs and kwargs['mount_name'] is not None:
mount_name = kwargs['mount_name']
if mount_name not in DEFAULT_CLIENT.clients:
setupDefaultClient(mount_name)
client = DEFAULT_CLIENT.clients[mount_name]
del kwargs['mount_name']
elif DEFAULT_CLIENT.client is None:
raise RuntimeError("default client not setup")
else:
client = DEFAULT_CLIENT.client
return client, kwargs
def _setupH3Method(name):
    # Create a module-level wrapper for Client.<name> that resolves the target
    # client from kwargs (or the thread-local default) at call time, then
    # install it under the same name in this module's globals.
    @functools.wraps(getattr(Client, name))
    def wrapper(*args, **kwargs):
        nonlocal name
        client, kwargs = _getDefaultClient(kwargs)
        return getattr(client, name)(*args, **kwargs)
    globals()[name] = wrapper

# Export one wrapper per supported Client method so callers can write e.g.
# hf3fs.stat(...) without holding a client object themselves.
for _name in ['stat', 'fstat', 'mkdir', 'rmdir', 'unlink', 'remove', 'realpath', 'readlink', 'opendir', 'readdir',
              'creat', 'symlink', 'link', 'open', 'close', 'chmod', 'chown', 'chdir', 'ftruncate',
              'iovalloc', 'iovfree', 'preadv', 'pwritev']:
    _setupH3Method(_name)
def setMountInfo(mount_name, token, as_super=False):
    """Register credentials for *mount_name* so setupDefaultClient() can use them."""
    # Plain mutation of the module-level dict; no `global` statement needed.
    MOUNT_INFO[mount_name] = MountInfo(token, as_super)
def setupDefaultClient(mount_name):
    """Create a client for a mount registered via setMountInfo() and cache it.

    Returns the new client; raises ValueError for an unregistered mount name.
    """
    if mount_name not in MOUNT_INFO:
        raise ValueError(f"unknown mount name '{mount_name}'")
    mount = MOUNT_INFO[mount_name]
    client = Client(mount_name, mount.token, as_super=mount.as_super)
    DEFAULT_CLIENT.clients[mount_name] = client
    return client
@contextmanager
def defaultClient(mount_name, token, as_super=False):
    """Context manager installing a temporary thread-local default client.

    Yields the new client and restores the previously active default client
    on exit, even when the body raises.
    """
    previous = DEFAULT_CLIENT.client
    DEFAULT_CLIENT.client = Client(mount_name, token, as_super=as_super)
    try:
        yield DEFAULT_CLIENT.client
    finally:
        DEFAULT_CLIENT.client = previous
def withClient(f):
    """Decorator: resolve the effective client from kwargs and pass it as client=."""
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        client, cleaned = _getDefaultClient(kwargs)
        return f(client=client, *args, **cleaned)
    return wrapper
@withClient
def listdir(path='.', client=None):
    # os.listdir() analogue: return the names of all entries in *path*,
    # iterating the client's directory stream until it is exhausted.
    if path is None:
        path = '.'
    dirp = client.opendir(path)
    # print(dirp, flush=True)
    fileList = []
    while True:
        dent = client.readdir(dirp)
        # print('dent', dent, flush=True)
        if dent is None:
            break
        fileList.append(dent.d_name)
    return fileList
class DirEntry(object):
    # Analogue of os.DirEntry for 3FS scandir(); stats lazily via the client.
    @withClient
    def __init__(self, parentPath, name, etype, parentFd, client=None):
        self._parentPath = PurePosixPath(parentPath)
        self._name = name
        self._etype = etype        # d_type from readdir (DT_* constant below)
        self._parentFd = parentFd  # fd of the parent directory, used as dir_fd for stat()
        self._client = client
        self._st = None            # cached stat result, filled on first stat()

    @property
    def name(self):
        return self._name

    @property
    def path(self):
        return str(self._parentPath / self._name)

    # POSIX file-type bits of st_mode (S_IF*) and readdir d_type values.
    _S_IFMT = 0o170000
    _S_IFLNK = 0o120000
    _S_IFREG = 0o100000
    _S_IFDIR = 0o040000
    _DT_DIR = 4
    _DT_REG = 8
    _DT_LNK = 10

    def _checkWFollow(self, against, follow_symlinks):
        # *against* is a (DT_* value, S_IF* mode) pair for the type under test;
        # symlinks are resolved through stat() only when follow_symlinks is set.
        if self._etype == against[0]:
            return True
        elif follow_symlinks and self.is_symlink():
            st = self.stat(True)
            return (st.st_mode & self._S_IFMT) == against[1]
        else:
            return False

    def is_dir(self, follow_symlinks=True):
        return self._checkWFollow((self._DT_DIR, self._S_IFDIR), follow_symlinks)

    def is_file(self, follow_symlinks=True):
        return self._checkWFollow((self._DT_REG, self._S_IFREG), follow_symlinks)

    def is_symlink(self):
        return self._etype == self._DT_LNK

    def stat(self, follow_symlinks=True):
        # NOTE(review): the first call's result is cached regardless of
        # follow_symlinks, so a later call with a different flag returns the
        # cached value — confirm this is intended.
        if self._st is None:
            self._st = self._client.stat(self._name, dir_fd=self._parentFd, follow_symlinks=follow_symlinks)
        return self._st
@withClient
def scandir(path='.', dir_fd=None, client=None):
    # os.scandir() analogue returning a combined context-manager/iterator of
    # DirEntry objects. It also opens an O_PATH fd on the scanned directory
    # (exposed as .dir_fd) so callers such as walk2() can stat entries
    # relative to it.
    class DirEntryIter(AbstractContextManager):
        def __init__(self, path, client, dir_fd):
            self._path = path
            self._client = client
            self._dir_fd = dir_fd  # dir_fd that *path* is resolved against
            self._fd = None        # O_PATH fd of the scanned directory
            self._dirp = None      # client directory stream

        @property
        def dir_fd(self):
            return self._fd

        def close(self):
            if self._fd is not None:
                self._client.close(self._fd)
                self._fd = None
                self._dirp = None

        def __iter__(self):
            # NOTE(review): if opendir() below raises, the fd opened here leaks
            # until close(); calling __iter__ twice also re-opens the fd — confirm.
            self._fd = self._client.open(self._path, os.O_DIRECTORY | os.O_PATH, dir_fd=self._dir_fd)
            # print('dirfd to scan', self._fd, flush=True)
            self._dirp = self._client.opendir(self._path, dir_fd=self._dir_fd)
            return self

        def __next__(self):
            dent = self._client.readdir(self._dirp)
            if dent is None:
                raise StopIteration()
            return DirEntry(self._path, dent.d_name, dent.d_type, self._fd, client=self._client)

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            self.close()
            return False  # never suppress exceptions
    if path is None:
        path = '.'
    return DirEntryIter(path, client, dir_fd)
@withClient
def walk2(top, topdown=True, onerror=None, followlinks=False, dir_fd=None, curr_dir=None, client=None):
    # os.walk() analogue that additionally yields the directory's O_PATH fd:
    # tuples of (dirpath, dirnames, filenames, dir_fd). *curr_dir* is internal
    # to the recursion: children are scanned relative to the parent's fd.
    try:
        with scandir(curr_dir if curr_dir is not None else top, dir_fd, client=client) as sd:
            dirnames = []
            filenames = []
            for dent in sd:
                name = dent.name
                if dent.is_dir(followlinks):
                    dirnames.append(name)
                    if not topdown:
                        # Bottom-up: recurse into the child before yielding this dir.
                        yield from walk2(dent.path, False, onerror, followlinks, sd.dir_fd, name, client=client)
                else:
                    filenames.append(name)
            yield (top, dirnames, filenames, sd.dir_fd)
            if topdown:
                topp = PurePosixPath(top)
                for dirname in dirnames:
                    yield from walk2(str(topp / dirname), True, onerror, followlinks, sd.dir_fd, dirname, client=client)
    except OSError as e:
        # Mirror os.walk(): report errors through onerror instead of raising.
        if onerror is not None:
            onerror(e)
@withClient
def walk(top, topdown=True, onerror=None, followlinks=False, dir_fd=None, client=None):
    """os.walk() analogue: yields (dirpath, dirnames, filenames), dropping the
    directory fd that walk2() additionally produces."""
    for entry in walk2(top, topdown, onerror, followlinks, dir_fd, None, client=client):
        yield entry[:3]
class BinaryFile(AbstractContextManager):
    """File-like wrapper over a 3FS client fd supporting read/write/seek.

    Modes: 'r' (read), 'r+' (read/write), 'r+c' (read/write, create),
    'w' (write, create+truncate), 'w+' (read/write, create+truncate).
    ignore_cache=True adds O_NONBLOCK to the open flags — presumably the 3FS
    client treats it as a cache-bypass hint; TODO confirm against the client.
    """
    @withClient
    def __init__(self, path, mode, dir_fd=None, client=None, ignore_cache=False):
        self.client = client
        if mode == 'r':
            flags = os.O_RDONLY
        elif mode == 'r+':
            flags = os.O_RDWR
        elif mode == 'r+c':
            flags = os.O_RDWR | os.O_CREAT
        elif mode == 'w':
            flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
        elif mode == 'w+':
            flags = os.O_RDWR | os.O_CREAT | os.O_TRUNC
        else:
            raise ValueError(f'invalid mode {mode}')
        if ignore_cache:
            flags |= os.O_NONBLOCK
        self._fd = None  # assigned before open() so close() is safe if open() raises
        self._off = 0    # current logical file offset
        self._fd = client.open(path, flags, 0o644, dir_fd=dir_fd)

    def __del__(self):
        # Use getattr: if __init__ raised before _fd was ever assigned (e.g.
        # an invalid mode), plain self.close() would hit AttributeError during GC.
        if getattr(self, '_fd', None) is not None:
            self.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying fd; safe to call more than once."""
        # print('to close fd', self._fd, flush=True)
        if self._fd is not None:
            self.client.close(self._fd)
            self._fd = None

    def fileno(self):
        return self._fd

    def seek(self, pos, how=os.SEEK_SET, readahead=None):
        """Move the file offset; returns the new absolute offset."""
        self._off = self.client.lseek(self._fd, pos, how, readahead=readahead)
        return self._off

    def tell(self):
        return self._off

    def _bytesLeft(self):
        # Distance from the current offset to EOF; restores the offset after probing.
        off = self._off
        flen = self.seek(0, os.SEEK_END)
        self.seek(off)
        return flen - off

    def read(self, size=None, readahead=None):
        """Read up to *size* bytes (to EOF when size is None); returns a memoryview."""
        if size is None:
            size = self._bytesLeft()
        buf = memoryview(bytearray(size))
        red = self.readinto(buf, readahead=readahead)
        return buf[:red]

    def readinto(self, buf, readahead=None):
        """Read into *buf*; returns the byte count and advances the offset."""
        red = self.client.read(self._fd, buf, readahead=readahead)
        self._off += red
        return red

    def write(self, buf, flush=False):
        """Write *buf*; returns the byte count and advances the offset."""
        writ = self.client.write(self._fd, buf, flush=flush)
        self._off += writ
        return writ

44
hf3fs/fuse.py Normal file
View File

@@ -0,0 +1,44 @@
import os
from pathlib import PosixPath
from hf3fs_py_usrbio import HF3FS_SUPER_MAGIC
# ioctl request numbers exposed by the 3FS FUSE client.
HF3FS_IOC_GET_MOUNT_NAME = 2149607424
HF3FS_IOC_GET_PATH_OFFSET = 2147772417
HF3FS_IOC_GET_MAGIC_NUM = 2147772418
HF3FS_IOC_RECURSIVE_RM = 2147772426
def serverPath(p):
    '''
    Derive the server-side path accepted by the client from a full local path.

    Resolves and normalizes *p*, drops the leading mount prefix (the first two
    components after the root, e.g. /hf3fs-cluster/<mount>), and returns the
    remainder re-anchored at '/'.

    Args:
        p: the path to resolve

    Examples:
        .. code-block:: python

            import hf3fs.fuse
            hf3fs.fuse.serverPath('/hf3fs-cluster/aaa/../cpu/abc/def')
    '''
    resolved = os.path.normpath(os.path.realpath(p))
    tail = PosixPath(resolved).parts[3:]
    return os.path.join('/', *tail)
def mountName(p):
    '''
    Derive the mount name from a full local path.

    Resolves and normalizes *p*, then returns the second path component after
    the root (e.g. 'cpu' for '/hf3fs-cluster/cpu/abc/def').

    Args:
        p: the path to resolve

    Examples:
        .. code-block:: python

            import hf3fs.fuse
            hf3fs.fuse.mountName('/hf3fs-cluster/aaa/../cpu/abc/def')
    '''
    resolved = os.path.normpath(os.path.realpath(p))
    return PosixPath(resolved).parts[2]

0
hf3fs_fuse/__init__.py Normal file
View File

7
hf3fs_fuse/fuse.py Normal file
View File

@@ -0,0 +1,7 @@
import os
from pathlib import PosixPath
def get_mount_point(p):
    """Return the mount point containing *p*: the first three components
    (root plus two levels) of the fully resolved path."""
    resolved = os.path.realpath(p)
    head = PosixPath(resolved).parts[:3]
    return os.path.join(*head)

30
hf3fs_fuse/fuse_demo.py Normal file
View File

@@ -0,0 +1,30 @@
from hf3fs_fuse.io import make_iovec, make_ioring, register_fd, deregister_fd
from multiprocessing.shared_memory import SharedMemory
import os
# Demo: end-to-end usrbio read through the 3FS FUSE mount.
# Create memory for IO
shm = SharedMemory(size=1024, create=True)
iov = make_iovec(shm, '/hf3fs-cluster', 0, -1)  # shm, mountpoint, blocksize, numa
shm.unlink()  # shm can be unlinked after make_iovec
# Create ioring for IO
ior = make_ioring('/hf3fs-cluster', 100, True, 0)  # mountpoint, num_entries, for_read, io_depth
# Open file
fd = os.open('/hf3fs-cluster/testread', os.O_RDONLY)
register_fd(fd)  # must register after open to use usrbio
# Read file: two slices of the shared buffer, read from two file offsets.
ios = [(iov[:512], fd, 512), (iov[512:], fd, 0)]  # iov, fd, offset
for io in ios:
    ior.prepare(io[0], True, io[1], io[2], userdata=io)  # iov, for_read, fd, offset, userdata
    # Only for_read == True is allowed, because ior has for_read == True
    # userdata must be a referenced python object, we reference io in the list ios, so it will not be sent to GC
resv = ior.submit().wait(min_results=2)
for res in resv:
    print(res.result)
    assert res.result == len(memoryview(res.userdata[0]))  # Check read length is correct
# Close file
deregister_fd(fd)  # must deregister before close
os.close(fd)

139
hf3fs_fuse/io.py Normal file
View File

@@ -0,0 +1,139 @@
import hf3fs_py_usrbio as h3fio
from hf3fs_py_usrbio import register_fd, deregister_fd, force_fsync, extract_mount_point, hardlink, punch_hole
import multiprocessing.shared_memory
import os
import os.path
from uuid import uuid4
class iovec:
    """Wrapper around an h3fio.iovec that owns the symlink registering the
    shared memory with 3FS; the symlink is removed when the wrapper is
    garbage-collected. Item access is delegated to the wrapped iovec.
    """
    def __init__(self, iov, link):
        self.iov = iov
        self.link = link

    def __del__(self):
        os.unlink(self.link)

    def __getitem__(self, key):
        return self.iov[key]

    def __setitem__(self, key, val):
        self.iov[key] = val
class ioring:
    """Thin wrapper over h3fio.ioring that unwraps our iovec class in prepare()."""
    def __init__(self, ior):
        self.ior = ior

    @staticmethod
    def size_for_entries(entries):
        # Shared-memory bytes needed for a ring holding this many entries.
        return h3fio.ioring.size_for_entries(entries)

    def prepare(self, iov, *args, **kwargs):
        # Accept either our iovec wrapper or a raw buffer / h3fio iovec.
        target = iov.iov if type(iov) == iovec else iov
        return self.ior.prepare(target, *args, **kwargs)

    def submit(self):
        return self.ior.submit()

    def wait(self, *args, **kwargs):
        return self.ior.wait(*args, **kwargs)
class IorPriority(object):
    # Scheduling priority values accepted by make_ioring(priority=...).
    HIGH = -1
    NORMAL = 0
    LOW = 1
def make_iovec(shm, hf3fs_mount_point, block_size=0, numa=-1):
    '''
    Create an iovec object.

    Registers the shared memory with 3FS by symlinking it under
    <mount>/3fs-virt/iovs/; the returned wrapper removes the symlink when
    garbage-collected.

    Args:
        shm: a Python multiprocessing.shared_memory.SharedMemory object
        hf3fs_mount_point: the hf3fs mount point
        block_size: defaults to 0, meaning the whole region is treated as one
            block; otherwise memory is registered in block_size chunks, which
            avoids triggering IB registration driver issues
        numa: defaults to -1, meaning no NUMA binding; set it to pin the
            memory to a specific NUMA node
    '''
    id = str(uuid4())
    target = os.path.normpath(f'/dev/shm/{shm.name}')
    # The ".b<size>" suffix encodes the block size in the registration link name.
    link = f'{hf3fs_mount_point}/3fs-virt/iovs/{id}{f".b{block_size}" if block_size > 0 else ""}'
    os.symlink(target, link)
    return iovec(h3fio.iovec(shm.buf, id, hf3fs_mount_point, block_size, numa), link)
def make_ioring(hf3fs_mount_point, entries, for_read=True, io_depth=0, priority=None, timeout=None, numa=-1, flags=0):
    '''
    Create an ioring object.

    io_depth controls the submission strategy, with three cases:

    io_depth = 0: whenever the ioring's background scan runs (or it is
    notified of pending work), it submits all queued IOs.
    io_depth > 0: the ioring submits exactly io_depth IOs at a time; the user
    must ensure enough IOs are eventually queued, otherwise wait() will block.
    io_depth < 0: each background scan/notification submits at most -io_depth IOs.

    Args:
        hf3fs_mount_point: the hf3fs mount point
        entries: maximum number of IO operations the ioring can hold
        for_read: the operation kind this ioring performs — True for reads, False for writes
        io_depth: submission strategy, see above
        numa: defaults to -1, meaning no NUMA binding; set it to pin the
            ioring's communication memory to a specific NUMA node
        flags: extra IO options; the main useful value today is 2, which
            reports an error when reading a hole instead of filling zeros
    '''
    return ioring(h3fio.ioring(hf3fs_mount_point, entries, for_read, io_depth, priority, timeout, numa, flags))
def read_file(fn, hf3fs_mount_point=None, block_size=1 << 30, off=0, priority=None, cb=None):
    """Read a file through usrbio in block_size chunks, starting at *off*.

    Args:
        fn: path of the file to read
        hf3fs_mount_point: mount point; derived from *fn* when None
        block_size: chunk size (also the shared-memory buffer size)
        off: starting offset
        priority: forwarded to make_ioring()
        cb: optional callback invoked as cb(chunk, offset) for each chunk.
            It may return an int to seek to that offset, a truthy value to
            stop, or None to continue. When cb is given this function
            returns None; otherwise it returns the file contents as bytes.
            Using cb is the suggested mode.
    """
    if hf3fs_mount_point is None:
        hf3fs_mount_point = extract_mount_point(fn)
    bufs = []
    # Track what was actually created so the finally block only cleans those
    # up; previously a failure in os.open() raised NameError here and masked
    # the original exception.
    fd = None
    ior = None
    iov = None
    shm = None
    registered = False
    try:
        fd = os.open(fn, os.O_RDONLY)
        register_fd(fd)
        registered = True
        shm = multiprocessing.shared_memory.SharedMemory(size=block_size, create=True)
        iov = make_iovec(shm, hf3fs_mount_point)
        ior = make_ioring(hf3fs_mount_point, 1, priority=priority)
        roff = off
        while True:
            ior.prepare(iov[:], True, fd, roff)
            done = ior.submit().wait(min_results=1)[0]
            if done.result < 0:
                # Negative results carry the errno.
                raise OSError(-done.result)
            if done.result == 0:
                break  # EOF
            if cb is None:
                bufs.append(bytes(shm.buf[:done.result]))
            else:
                res = cb(shm.buf[:done.result], roff)
                if type(res) == int:
                    roff = res  # callback requested a seek
                    continue
                elif res:
                    return  # callback requested a stop
            if done.result < block_size:
                break  # short read: end of file reached
            roff += block_size
        if cb is not None:
            return
        if len(bufs) == 1:
            return bufs[0]
        else:
            return b''.join(bufs)
    finally:
        if registered:
            deregister_fd(fd)
        if fd is not None:
            os.close(fd)
        # Drop the iovec before unlinking the shm so its registration symlink
        # is removed first.
        del ior
        del iov
        if shm is not None:
            shm.close()
            shm.unlink()

37
hf3fs_utils/README.md Normal file
View File

@@ -0,0 +1,37 @@
# hf3fs_cli
build:
```bash
python3 setup_hf3fs_utils.py bdist_wheel
```
usage:
```.bash
$ hf3fs_cli rmtree --help
Usage: hf3fs_cli rmtree [OPTIONS] [DIR_PATHS]...
Move a directory tree to the trash and set an expiration time, it will be automatically deleted after expiration
Example:
hf3fs_cli rmtree <path/to/remove> --expire <expire_time>
- Use --expire [1h|3h|8h|1d|3d|7d] to specify the expiration time, the directory will be deleted after expiration.
- Before expiration, you can restore the directory from the trash using `hf3fs_cli mv <trash_path> <target_path>`.
- If you need to free up space immediately, you can use `hf3fs_cli rmtree <trash_path>` to delete the data in the trash immediately, this operation cannot be undone!
- Use `ls /path/to/hf3fs/trash` to view the trash.
Options:
--expire [1h|3h|8h|1d|3d|7d] Expiration time, contents in the trash will be automatically deleted after expiration
-y, --yes Skip confirmation prompt and delete immediately
--help Show this message and exit.
$ hf3fs_cli mv --help
Usage: hf3fs_cli mv [OPTIONS] OLD_PATH NEW_PATH
Move files, supports moving files between different mount points within the same 3FS
Options:
--help Show this message and exit.
```
If you want to use `rmtree` command, the administrator needs to create a trash directory for each user at `/{3fs_mountpoint}/trash/{user_name}`. The cleanup of the trash directory is handled by the `trash_cleaner`. For instructions on how to use it, please refer to `src/client/trash_cleaner/`.

0
hf3fs_utils/__init__.py Normal file
View File

192
hf3fs_utils/cli.py Normal file
View File

@@ -0,0 +1,192 @@
import errno
import click
import os
import sys
import stat
from typing import Optional, List
from hf3fs_utils.fs import is_relative_to, FileSystem
from hf3fs_utils.trash import TRASH_CONFIGS, Trash
# Optional mount-point override; when unset, the mount point is auto-detected
# by probing ancestors of the target path for a "3fs-virt" marker directory.
MOUNTPOINT = os.environ.get("HF3FS_CLI_MOUNTPOINT", None)

def get_filesystem(path: str) -> FileSystem:
    # Locate the 3FS mount point containing *path* and wrap it in a FileSystem.
    mountpoint = None
    if MOUNTPOINT is not None:
        mountpoint = os.path.abspath(MOUNTPOINT)
    else:
        path = os.path.realpath(path)
        parts = path.split(os.sep)
        # A mount point is at most 3 components deep (e.g. /hf3fs/<cluster>);
        # check each ancestor for the 3fs-virt marker.
        for i in range(1, 4):
            p = os.sep.join(parts[:i])
            if os.path.exists(os.path.join(p, "3fs-virt")):
                mountpoint = p
                break
    if not mountpoint:
        abort(f"{path} is not on 3FS")
    return FileSystem(mountpoint)
def abs_path(path: str) -> str:
    """Normalize *path*, resolving the directory part but keeping the final
    component untouched (so rmtree on a symlink targets the link itself)."""
    if ".." in path.split(os.path.sep):
        abort(f"Path {path} contains '..', which is not supported yet")
    normpath = os.path.normpath(path)
    head, tail = os.path.split(normpath)
    return os.path.join(os.path.realpath(head), tail)
def abort(msg):
    # Print *msg* in red to stderr and terminate the CLI with exit status 1.
    click.echo(click.style(msg, fg="red"), err=True)
    sys.exit(1)
@click.group()
def cli():
    """
    3FS command-line tool
    """
    # Group entry point only; subcommands are registered via @cli.command().
@cli.command()
@click.argument("old_path", type=click.Path(exists=True))
@click.argument("new_path", type=click.Path())
def mv(old_path: str, new_path: str):
    """
    Move files, supports moving files between different mount points within the same 3FS
    """
    try:
        old_path = abs_path(old_path)
        new_path = abs_path(new_path)
        try:
            new_st = os.stat(new_path, follow_symlinks=True)
            # new_path exists, should be a directory
            if not stat.S_ISDIR(new_st.st_mode):
                raise FileExistsError(errno.EEXIST, os.strerror(errno.EEXIST), new_path)
            # move to new_path/filename
            new_path = os.path.join(new_path, os.path.basename(old_path))
        except FileNotFoundError:
            # new_path does not exist yet: rename old_path to it directly.
            pass
        fs = get_filesystem(old_path)
        fs.rename(old_path, new_path)
        click.echo(f"Move successful: {old_path} -> {new_path}")
    except AssertionError:
        # Let assertions propagate unchanged instead of turning them into aborts.
        raise
    except Exception as ex:
        abort(f"Move failed: {ex}")
class ExpireType(click.ParamType):
    # click parameter type that normalizes long expiration spellings such as
    # "3hours"/"1day" down to the short keys used by TRASH_CONFIGS ("3h", "1d").
    def get_metavar(self, param) -> str:
        return "[1h|3h|8h|1d|3d|7d]"

    def convert(self, value, param, ctx):
        norm_value = value
        # Note the order: "...hour" (singular) is checked before "...hours".
        if norm_value.endswith("hour"):
            norm_value = norm_value.replace("hour", "h")
        elif norm_value.endswith("hours"):
            norm_value = norm_value.replace("hours", "h")
        elif norm_value.endswith("day"):
            norm_value = norm_value.replace("day", "d")
        elif norm_value.endswith("days"):
            norm_value = norm_value.replace("days", "d")
        if norm_value not in TRASH_CONFIGS.keys():
            self.fail(f"{value} is invalid, valid options are {self.get_metavar()}", param, ctx)
        else:
            return norm_value
@cli.command()
@click.argument("dir_paths", type=click.Path(exists=True), nargs=-1)
@click.option(
    "--expire",
    type=ExpireType(),
    help="Expiration time, contents in the trash will be automatically deleted after expiration",
)
@click.option("-y", "--yes", is_flag=True, default=False, help="Skip confirmation prompt and delete immediately")
def rmtree(dir_paths: List[str], expire: Optional[str], yes: bool):
    """
    Move a directory tree to the trash and set an expiration time, it will be automatically deleted after expiration

    \b
    Example:
        hf3fs_cli rmtree <path/to/remove> --expire <expire_time>

    \b
    - Use --expire [1h|3h|8h|1d|3d|7d] to specify the expiration time, the directory will be deleted after expiration.
    - Before expiration, you can restore the directory from the trash using `hf3fs_cli mv <trash_path> <target_path>`.
    - If you need to free up space immediately, you can use `hf3fs_cli rmtree <trash_path>` to delete the data in the trash immediately, this operation cannot be undone!
    - Use `ls /hf3fs/{cluster}/trash` to view the trash.
    """
    if not dir_paths:
        abort(f"Please provide the directory path to delete")
    first_path = abs_path(dir_paths[0])
    fs = get_filesystem(first_path)
    fs_trash = Trash(fs)
    # Paths already inside the trash mean "purge now"; otherwise we move them
    # into a trash bucket with an expiration time.
    clean_trash = is_relative_to(first_path, fs_trash.trash_path)
    if not clean_trash:
        if not expire:
            abort(f"Use --expire [1h|3h|8h|1d|3d|7d] to specify the expiration time")
    elif expire:
        abort(f"{first_path} is already in the trash")
    trash_cfg = TRASH_CONFIGS[expire] if not clean_trash else None
    dir_paths = [abs_path(p) for p in dir_paths]
    # All paths must be on the same side: all in the trash, or all outside it.
    for dir_path in dir_paths:
        if is_relative_to(dir_path, fs_trash.trash_path) != clean_trash:
            if clean_trash:
                abort(f"{dir_path} is not in the trash")
            else:
                abort(f"{dir_path} is already in the trash")
    if clean_trash:
        if len(dir_paths) != 1:
            msg = (
                f"Immediately delete the following paths:\n"
                + "\n".join([f"- {p}" for p in dir_paths])
                + "\nThis operation cannot be undone"
            )
        else:
            # Fix: previously relied on `dir_path` leaking out of the loop above.
            msg = f"Immediately delete {dir_paths[0]}, this operation cannot be undone"
    else:
        if len(dir_paths) != 1:
            msg = (
                f"Move the following paths to the trash:\n"
                + "\n".join([f"- {p}" for p in dir_paths])
                + f"\nThey will be automatically deleted after {expire}"
            )
        else:
            # Fix: previously relied on `dir_path` leaking out of the loop above.
            msg = f"Move {dir_paths[0]} to the trash, it will be automatically deleted after {expire}"
    if not yes:
        assert click.confirm(msg, abort=True)
    for dir_path in dir_paths:
        try:
            if clean_trash:
                fs.remove(dir_path, recursive=True)
                click.echo(f"- Deleted {dir_path}")
            else:
                trash_path = fs_trash.move_to_trash(dir_path, trash_cfg)
                click.echo(f"- Trash path: {trash_path}")
        except AssertionError:
            raise
        except Exception as ex:
            abort(f"Failed to delete {dir_path}: {ex}")
    if not clean_trash:
        click.echo(
            "- Before expiration, you can use 'hf3fs_cli mv <trash_path> <target_path>' to restore, "
            "or use 'hf3fs_cli rmtree <trash_path>' to delete immediately and free up space"
        )
if __name__ == "__main__":
    # Allow invoking this module directly as the CLI entry point.
    cli()

234
hf3fs_utils/fs.py Normal file
View File

@@ -0,0 +1,234 @@
import os
import fcntl
import errno
import struct
import stat
import sys
import pathlib
from typing import Tuple
def is_relative_to(path1, path2) -> bool:
    """Return True if *path1* equals *path2* or lies beneath it (purely
    lexical check, no filesystem access)."""
    try:
        pathlib.PurePath(path1).relative_to(path2)
        return True
    except ValueError:
        # relative_to() raises ValueError for non-subpaths; the previous bare
        # `except:` would also have swallowed KeyboardInterrupt/SystemExit.
        return False
class FileSystem:
    """Management interface for a 3FS FUSE mount point.

    Validates the mount at construction time, then issues rename/remove
    operations through ioctls on directory fds, which lets the 3FS client
    implement cross-directory moves (including moves into the trash).
    """
    # ioctl request codes and fixed argument-buffer sizes; must stay in sync
    # with the C definitions in the FUSE client.
    HF3FS_IOCTL_MAGIC_CMD = 0x80046802
    HF3FS_IOCTL_MAGIC_NUM = 0x8F3F5FFF
    HF3FS_IOCTL_VERSION_CMD = 0x80046803
    HF3FS_IOCTL_RENAME_CMD = 0x4218680E
    HF3FS_IOCTL_RENAME_BUFFER_SIZE = 536
    HF3FS_IOCTL_REMOVE_CMD = 0x4110680F
    HF3FS_IOCTL_REMOVE_BUFFER_SIZE = 272

    def __init__(self, mountpoint: str) -> None:
        # Resolve and validate the mount point, then probe the 3fs-virt control
        # directory: the magic-number ioctl proves this really is a 3FS mount,
        # and the version ioctl proves the client supports the required calls.
        self.mountpoint = os.path.realpath(mountpoint)
        self.virt_path = os.path.join(self.mountpoint, "3fs-virt")
        # Check if the mount point is a directory
        if not os.path.exists(self.mountpoint):
            raise FileNotFoundError(
                errno.ENOENT, os.strerror(errno.ENOENT), self.mountpoint
            )
        if not os.path.isdir(self.mountpoint):
            raise NotADirectoryError(
                errno.ENOTDIR, os.strerror(errno.ENOTDIR), self.mountpoint
            )
        virt_fd = None
        try:
            # Check the 3fs-virt directory
            virt_fd = os.open(self.virt_path, os.O_RDONLY | os.O_DIRECTORY)
            virt_st = os.fstat(virt_fd)
            if not stat.S_ISDIR(virt_st.st_mode):
                raise NotADirectoryError(
                    errno.ENOTDIR, os.strerror(errno.ENOTDIR), self.virt_path
                )
            # Remember the device id so later operations can verify that a
            # given path actually lives on this mount.
            self.st_dev = virt_st.st_dev
            # Check the magic number
            buffer = bytearray(4)
            try:
                self._ioctl(virt_fd, FileSystem.HF3FS_IOCTL_MAGIC_CMD, buffer)
            except OSError:
                raise RuntimeError(f"{self.mountpoint} is not a 3FS mount point")
            magic_number = struct.unpack("I", buffer)[0]
            expected_magic_number = FileSystem.HF3FS_IOCTL_MAGIC_NUM
            if magic_number != expected_magic_number:
                raise RuntimeError(
                    f"{self.mountpoint} is not a 3FS mount point, "
                    f"magic number {magic_number:x} != {expected_magic_number:x}"
                )
            # Check if the required ioctl is supported
            ioctl_version = -1
            try:
                buffer = bytearray(4)
                self._ioctl(virt_fd, FileSystem.HF3FS_IOCTL_VERSION_CMD, buffer)
                ioctl_version = int.from_bytes(buffer, sys.byteorder, signed=False)
            except OSError:
                pass  # older clients without the version ioctl fail the assert below
            assert ioctl_version >= 1
        finally:
            if virt_fd is not None:
                os.close(virt_fd)

    def _check_user(self):
        # Refuse to operate as root: all management ioctls go through here.
        if os.geteuid() == 0 or os.getegid() == 0:
            raise RuntimeError(f"root user not allowed")

    def _encode_filename(self, name: str) -> bytes:
        # A single path component, UTF-8 encoded, limited to 255 bytes (the
        # ioctl buffer reserves 256 bytes per name).
        assert name and os.sep not in name, name
        name_bytes = name.encode("utf8")
        if len(name_bytes) > 255:
            raise OSError(errno.ENAMETOOLONG, os.strerror(errno.ENAMETOOLONG), name)
        return name_bytes

    def opendir(self, dir_path: str) -> Tuple[int, os.stat_result]:
        # Open *dir_path* and return (fd, stat), verifying it is a real
        # directory on this mount; the fd is closed on any failure.
        dir_fd = os.open(dir_path, os.O_DIRECTORY | os.O_RDONLY)
        try:
            try:
                dir_st = os.fstat(dir_fd)
            except OSError as ex:
                ex.filename = dir_path
                raise
            if not stat.S_ISDIR(dir_st.st_mode):
                raise NotADirectoryError(
                    errno.ENOTDIR, os.strerror(errno.ENOTDIR), dir_path
                )
            if dir_st.st_dev != self.st_dev:
                raise RuntimeError(f"{dir_path} is not under the 3FS mount point {self.mountpoint}")
            # High inode bits mark 3FS virtual entries, which must not be
            # renamed or removed.
            if dir_st.st_ino & 0xF000000000000000:
                raise RuntimeError(f"{dir_path} is a virtual path")
            return dir_fd, dir_st
        except:
            os.close(dir_fd)
            raise

    def split_path(self, path: str) -> Tuple[int, os.stat_result, str]:
        # Split *path* into (parent dir fd, parent dir stat, final component),
        # validating the component on the way.
        filename = os.path.basename(path)
        if not filename:
            raise RuntimeError(f"{path} has no filename")
        if filename in [".", ".."]:
            # NOTE(review): this message looks garbled by extraction
            # ("(unknown)"); it presumably named the offending component — verify.
            raise RuntimeError(f"{path} filename is (unknown)")
        if len(filename.encode("utf8")) > 255:
            raise OSError(errno.ENAMETOOLONG, os.strerror(errno.ENAMETOOLONG), path)
        dir = os.path.dirname(path) or "."
        dir_fd, dir_st = self.opendir(dir)
        return dir_fd, dir_st, filename

    def rename(self, old_path: str, new_path: str) -> None:
        # Rename old_path to new_path via ioctl; refuses symlink sources,
        # already-existing targets, and destinations inside the trash.
        self._check_user()
        if is_relative_to(
            os.path.realpath(new_path), os.path.join(self.mountpoint, "trash")
        ):
            raise RuntimeError(f"{new_path} is in the trash")
        old_dir_fd = None
        new_dir_fd = None
        try:
            old_dir_fd, old_dir_st, old_filename = self.split_path(old_path)
            new_dir_fd, new_dir_st, new_filename = self.split_path(new_path)
            try:
                old_st = os.stat(old_filename, dir_fd=old_dir_fd, follow_symlinks=False)
                if stat.S_ISLNK(old_st.st_mode):
                    raise RuntimeError(f"{old_path} is symlink")
            except OSError as ex:
                ex.filename = old_path
                raise
            try:
                os.stat(new_filename, dir_fd=new_dir_fd, follow_symlinks=False)
                raise FileExistsError(errno.EEXIST, os.strerror(errno.EEXIST), new_path)
            except FileNotFoundError:
                pass  # target absent: exactly what we want
            except OSError as ex:
                ex.filename = new_path
                raise
            try:
                self._rename_ioctl(
                    old_dir_fd,
                    old_dir_st.st_ino,
                    old_filename,
                    new_dir_st.st_ino,
                    new_filename,
                    False,
                )
            except OSError as ex:
                ex.filename = old_path
                ex.filename2 = new_path
                raise
        finally:
            if old_dir_fd is not None:
                os.close(old_dir_fd)
            if new_dir_fd is not None:
                os.close(new_dir_fd)

    def _rename_ioctl(
        self,
        old_dir_fd: int,
        old_dir_ino: int,
        old_filename: str,
        new_dir_ino: int,
        new_filename: str,
        move_to_trash: bool,
    ) -> None:
        # Argument layout must match the kernel-side struct:
        # (old parent ino, old name[256], new parent ino, new name[256], to-trash flag).
        assert old_filename and not os.path.sep in old_filename, old_filename
        assert new_filename and not os.path.sep in new_filename, new_filename
        cmd = FileSystem.HF3FS_IOCTL_RENAME_CMD
        # NOTE(review): .ljust pads with b' ' (spaces), not NUL bytes — confirm
        # the client ignores bytes past the packed struct.
        buffer = struct.pack(
            "N256sN256s?",
            old_dir_ino,
            self._encode_filename(old_filename),
            new_dir_ino,
            self._encode_filename(new_filename),
            move_to_trash,
        ).ljust(FileSystem.HF3FS_IOCTL_RENAME_BUFFER_SIZE)
        self._ioctl(old_dir_fd, cmd, buffer)

    def remove(self, path: str, recursive: bool) -> None:
        # Remove *path* via ioctl; recursive directory removal additionally
        # requires the caller to own the directory with rwx permissions.
        dir_fd = None
        try:
            dir_fd, dir_st, filename = self.split_path(path)
            st = os.stat(filename, dir_fd=dir_fd, follow_symlinks=False)
            if stat.S_ISLNK(st.st_mode):
                raise RuntimeError(f"{path} is symlink")
            if stat.S_ISDIR(st.st_mode) and recursive:
                # The user must be the owner of the directory and have rwx permissions
                imode = stat.S_IMODE(st.st_mode)
                if st.st_uid != os.geteuid() or (imode & 0o700) != 0o700:
                    raise PermissionError(errno.EPERM, os.strerror(errno.EPERM), path)
            try:
                self._remove_ioctl(dir_fd, dir_st.st_ino, filename, recursive)
            except OSError as ex:
                ex.filename = path
                raise ex
        finally:
            if dir_fd is not None:
                os.close(dir_fd)

    def _remove_ioctl(
        self, parent_fd: int, parent_ino: int, filename: str, recursive: bool
    ) -> None:
        # Argument layout: (parent ino, name[256], recursive flag), padded to
        # the fixed buffer size.
        assert filename and os.sep not in filename, filename
        cmd = FileSystem.HF3FS_IOCTL_REMOVE_CMD
        buffer = struct.pack(
            "N256s?", parent_ino, self._encode_filename(filename), recursive
        ).ljust(FileSystem.HF3FS_IOCTL_REMOVE_BUFFER_SIZE)
        self._ioctl(parent_fd, cmd, buffer)

    def _ioctl(self, fd: int, cmd: int, buffer):
        # Single choke point for all management ioctls; always re-checks the user.
        self._check_user()
        return fcntl.ioctl(fd, cmd, buffer)

5
hf3fs_utils/hf3fs_cli Normal file
View File

@@ -0,0 +1,5 @@
#!/usr/bin/env python
# Console-script entry point for the 3FS CLI.
from hf3fs_utils import cli

if __name__ == "__main__":
    # Widen click's help output to 120 columns.
    cli.cli(max_content_width=120)

176
hf3fs_utils/trash.py Normal file
View File

@@ -0,0 +1,176 @@
import os
import dataclasses
import pwd
import stat
import errno
import time
from typing import Optional
from datetime import datetime, timedelta, timezone
from . import fs
# Trash bucket timestamps are rendered in UTC+8 with minute resolution.
UTC8_TZ = timezone(timedelta(hours=8))
DATE_FORMAT = "%Y%m%d_%H%M"
# Fixed epoch used to align time-slice boundaries across processes.
BASE_TIMESTAMP = int(datetime(year=1980, month=1, day=1, tzinfo=UTC8_TZ).timestamp())
def format_date(t: datetime) -> str:
    # Render an aware datetime as YYYYmmdd_HHMM in UTC+8.
    assert t.tzinfo
    return t.astimezone(tz=UTC8_TZ).strftime(DATE_FORMAT)

def parse_date(t: str) -> datetime:
    # Inverse of format_date(); the result is timezone-aware (UTC+8).
    return datetime.strptime(t, DATE_FORMAT).replace(tzinfo=UTC8_TZ)

def get_timestamp_us() -> int:
    # Current wall-clock time as integer microseconds.
    timestamp_seconds = time.time()
    return int(timestamp_seconds * 1_000_000)
@dataclasses.dataclass
class TrashConfig:
    """One expiration tier of the trash.

    Bucket directory names encode the tier, the bucket's start time, and its
    deletion deadline: <name>-<start>-<deadline>.
    """
    name: str              # tier label (e.g. "1h"); must not contain '-'
    expire: timedelta      # how long entries live before deletion
    time_slice: timedelta  # width of one bucket directory

    def __post_init__(self):
        # Validate on construction so bad tiers fail fast.
        assert self.name and "-" not in self.name, f"invalid name {self.name}"
        assert self.expire >= timedelta(minutes=1), self.expire
        assert self.time_slice >= timedelta(minutes=1), self.time_slice
        assert self.time_slice < self.expire, (self.time_slice, self.expire)

    def current_dir(self) -> str:
        """Directory name of the bucket covering the current moment."""
        now_ts = int(datetime.now(tz=UTC8_TZ).timestamp())
        assert now_ts > BASE_TIMESTAMP, now_ts
        slice_s = int(self.time_slice.total_seconds())
        expire_s = int(self.expire.total_seconds())
        assert slice_s and expire_s, repr(self)
        # Snap down to the start of the current slice, measured from BASE_TIMESTAMP.
        start_ts = BASE_TIMESTAMP + ((now_ts - BASE_TIMESTAMP) // slice_s) * slice_s
        # Deadline is one slice past the expiry so a bucket is never reaped early.
        end_ts = start_ts + expire_s + slice_s
        start_dt = datetime.fromtimestamp(start_ts, tz=UTC8_TZ)
        end_dt = datetime.fromtimestamp(end_ts, tz=UTC8_TZ)
        return f"{self.name}-{format_date(start_dt)}-{format_date(end_dt)}"
# Available expiration tiers, keyed by the CLI's --expire option value.
TRASH_CONFIGS = {
    "1h": TrashConfig("1h", timedelta(hours=1), timedelta(minutes=10)),
    "3h": TrashConfig("3h", timedelta(hours=3), timedelta(minutes=30)),
    "8h": TrashConfig("8h", timedelta(hours=8), timedelta(minutes=30)),
    "1d": TrashConfig("1d", timedelta(days=1), timedelta(hours=1)),
    "3d": TrashConfig("3d", timedelta(days=3), timedelta(days=1)),
    "7d": TrashConfig("7d", timedelta(days=7), timedelta(days=1)),
}
class Trash:
    """Per-user trash manager on top of a FileSystem.

    Entries are moved (via the rename ioctl with move_to_trash=True) into
    time-bucketed directories under <mount>/trash/<user>/; an external
    cleaner deletes expired buckets.
    """
    def __init__(
        self,
        filesystem: fs.FileSystem,
        user: Optional[int] = None,
        user_name: Optional[str] = None,
    ) -> None:
        # Default to the effective uid and its passwd name; root is refused.
        if user is None:
            user = os.geteuid()
        assert isinstance(user, int), user
        if user_name is None:
            user_name = pwd.getpwuid(user).pw_name
        if user == 0:
            raise RuntimeError(f"hf3fs trash does not support root user")
        # Check if the trash directory is mounted
        trash = os.path.join(filesystem.mountpoint, "trash")
        if not os.path.exists(trash):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), trash)
        # Check if the user's trash directory exists
        user_trash = os.path.join(filesystem.mountpoint, "trash", user_name)
        if not os.path.exists(user_trash):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), user_trash)
        user_trash_fd, user_trash_st = filesystem.opendir(user_trash)
        os.close(user_trash_fd)
        assert stat.S_ISDIR(user_trash_st.st_mode)
        # The user's trash directory must be owned by that user.
        if user_trash_st.st_uid != user:
            raise RuntimeError(
                f"Trash directory {user_trash}, owner {user_trash_st.st_uid} != {user}"
            )
        self.filesystem = filesystem
        self.user = user
        self.user_name = user_name
        self.trash_path = trash
        self.user_trash_path = user_trash

    def _check_user(self):
        # Only the trash owner may move entries into it.
        euid = os.geteuid()
        if euid != self.user:
            raise RuntimeError(f"euid {euid} != trash owner {self.user}")

    def move_to_trash(
        self,
        path: str,
        config: TrashConfig,
        trash_name: Optional[str] = None,
        append_timestamp_if_exists: bool = True,
    ) -> str:
        # Move *path* into the current bucket for *config* and return the
        # resulting trash path. On name collisions, a microsecond timestamp is
        # appended (up to 10 attempts) when append_timestamp_if_exists is set.
        self._check_user()
        assert isinstance(config, TrashConfig), f"invalid trash config {config}"
        dir_fd = None
        trash_dir_fd = None
        try:
            dir_fd, dir_st, filename = self.filesystem.split_path(path)
            try:
                st = os.stat(filename, dir_fd=dir_fd, follow_symlinks=False)
            except OSError as ex:
                ex.filename = path
                raise
            if stat.S_ISDIR(st.st_mode):
                # The user must be the owner of the directory and have rwx permissions.
                imode = stat.S_IMODE(st.st_mode)
                if st.st_uid != os.geteuid() or (imode & 0o700) != 0o700:
                    raise PermissionError(errno.EPERM, os.strerror(errno.EPERM), path)
            trash_dir = os.path.join(self.user_trash_path, config.current_dir())
            try:
                os.mkdir(trash_dir, 0o755)
            except FileExistsError:
                pass  # bucket already created, possibly by a concurrent process
            trash_dir_fd, trash_dir_st = self.filesystem.opendir(trash_dir)
            trash_name = trash_name or filename
            current_trash_name = trash_name
            retry = 0
            while True:
                retry += 1
                try:
                    # move_to_trash=True lets the FUSE client treat this
                    # rename as a trash move.
                    self.filesystem._rename_ioctl(
                        dir_fd,
                        dir_st.st_ino,
                        filename,
                        trash_dir_st.st_ino,
                        current_trash_name,
                        True,
                    )
                    return os.path.join(trash_dir, current_trash_name)
                except OSError as ex:
                    if (
                        ex.errno in (errno.ENOTDIR, errno.EEXIST, errno.ENOTEMPTY)
                        and append_timestamp_if_exists
                        and retry < 10
                    ):
                        # Name taken: truncate to 200 chars, append a
                        # microsecond timestamp, and retry.
                        current_trash_name = f"{trash_name[0:200]}.{get_timestamp_us()}"
                    else:
                        raise
        finally:
            if dir_fd is not None:
                os.close(dir_fd)
            if trash_dir_fd is not None:
                os.close(trash_dir_fd)

View File

@@ -0,0 +1,7 @@
Copyright (c) 2018 Pranav Srinivas Kumar <pranav.srinivas.kumar@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

2292
licenses/arrow/LICENSE.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,23 @@
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

View File

@@ -0,0 +1,207 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-------------------------------------------------------------------------------
SOFTWARE DISTRIBUTED WITH FOUNDATIONDB:
The FoundationDB software includes a number of subcomponents with separate
copyright notices and license terms - please see the file ACKNOWLEDGEMENTS.
-------------------------------------------------------------------------------

27
licenses/leveldb/LICENSE Normal file
View File

@@ -0,0 +1,27 @@
Copyright (c) 2011 The LevelDB Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -0,0 +1,25 @@
Copyright (c) 2010-2014, Salvatore Sanfilippo <antirez at gmail dot com>
Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2019 - 2024 Daniil Goncharov
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

21
licenses/nameof/LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2016 - 2024 Daniil Goncharov
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

29
licenses/rapidcsv/LICENSE Normal file
View File

@@ -0,0 +1,29 @@
BSD 3-Clause License
Copyright (c) 2017, Kristofer Berggren
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2018-2021 Martin Ankerl
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

23
licenses/smhasher/LICENSE Normal file
View File

@@ -0,0 +1,23 @@
All MurmurHash source files are placed in the public domain.
The license below applies to all other code in SMHasher:
Copyright (c) 2011 Google, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

View File

@@ -0,0 +1,16 @@
MIT License
Copyright (c) Mark Gillard <mark.gillard@outlook.com.au>
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

24
licenses/utf8.h/LICENSE Normal file
View File

@@ -0,0 +1,24 @@
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org/>

17
patches/apply.sh Executable file
View File

@@ -0,0 +1,17 @@
#!/bin/bash
# Apply local patches to vendored third_party repositories, idempotently.
set -e
cd "$(dirname "$0")"

# apply_patch <repo-name> <patch-file>
# Skips the patch when `git apply --reverse --check` shows it is already in.
apply_patch() {
  local repo="$1" patch="$2"
  if git -C "../third_party/${repo}" apply --reverse --check "../../patches/${patch}" &>/dev/null; then
    echo "${repo} patch already applied. skipping."
  else
    git -C "../third_party/${repo}" apply "../../patches/${patch}"
  fi
}

apply_patch rocksdb rocksdb.patch
apply_patch folly folly.patch

1437
patches/folly.patch Normal file

File diff suppressed because it is too large Load Diff

13
patches/rocksdb.patch Normal file
View File

@@ -0,0 +1,13 @@
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e9e506951..79a68507f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -620,7 +620,7 @@ set(SOURCES
db/blob/blob_source.cc
db/blob/prefetch_buffer_collection.cc
db/builder.cc
- db/c.cc
+ # db/c.cc
db/column_family.cc
db/compaction/compaction.cc
db/compaction/compaction_iterator.cc

94
setup.py Normal file
View File

@@ -0,0 +1,94 @@
import os
import re
import subprocess
import sys

from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext

# import setuptools_scm as stscm
# version = stscm.get_version(root=".", relative_to=__file__)

# Package version: base release plus the short git revision of HEAD,
# e.g. "1.2.9+abc1234". Requires running from inside a git checkout.
rev = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode('ascii').rstrip()
version = "1.2.9+" + rev
print('package version', version)
# A CMakeExtension needs a sourcedir instead of a file list.
# The name must be the _single_ output extension from the CMake build.
# If you need multiple extensions, see scikit-build.
class CMakeExtension(Extension):
    """setuptools Extension placeholder whose actual build is done by CMake."""

    def __init__(self, name, sourcedir=""):
        # No Python-visible sources: CMake owns the build inputs.
        super().__init__(name, sources=[])
        self.sourcedir = os.path.abspath(sourcedir)
class CMakeBuild(build_ext):
    """build_ext command that configures and builds extensions via CMake."""

    def build_extension(self, ext):
        """Run a CMake configure + build for `ext` (a CMakeExtension).

        The resulting shared library is emitted directly into the staging
        directory setuptools expects for this extension (extdir).
        """
        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
        # required for auto-detection & inclusion of auxiliary "native" libs
        if not extdir.endswith(os.path.sep):
            extdir += os.path.sep

        # The DEBUG environment variable selects the build type unless
        # --debug was passed to build_ext explicitly.
        debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug
        cfg = "Debug" if debug else "RelWithDebInfo"

        # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON
        cmake_args = [
            f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}",
            f"-DPYTHON_EXECUTABLE={sys.executable}",
            f"-DCMAKE_BUILD_TYPE={cfg}",  # not used on MSVC, but no harm
            "-DCMAKE_CXX_COMPILER=clang++-14",
            "-DCMAKE_C_COMPILER=clang-14",
            "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON",
            "-DUSE_RTTI=ON",
            "-DOVERRIDE_CXX_NEW_DELETE=OFF",
            "-DSAVE_ALLOCATE_SIZE=OFF",
            "-DFOLLY_DISABLE_LIBUNWIND=ON",
        ]
        build_args = []

        # Adding CMake arguments set as environment variable
        # (needed e.g. to build for ARM OSx on conda-forge)
        if "CMAKE_ARGS" in os.environ:
            cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item]

        # cmake_args += [f"-DPYCLIENT_VERSION_INFO={self.distribution.get_version()}"]

        # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level
        # across all generators.
        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
            # self.parallel is a Python 3 only way to set parallel jobs by hand
            # using -j in the build_ext call, not supported by pip or PyPA-build.
            if hasattr(self, "parallel") and self.parallel:
                # CMake 3.12+ only.
                build_args += [f"-j{self.parallel}"]

        build_temp = self.build_temp
        os.makedirs(build_temp, exist_ok=True)

        subprocess.check_call(["cmake", "-S", ext.sourcedir] + cmake_args, cwd=build_temp)
        subprocess.check_call(
            ["cmake", "--build", ".", "--target", "hf3fs_py_usrbio"] + build_args,
            cwd=build_temp,
        )
# The information here can also be placed in setup.cfg - better separation of
# logic and declaration, and simpler if you include description/version in a file.
setup(
    name="hf3fs_py_usrbio",
    version=version,  # "1.2.9+<git short rev>", computed above
    description="Python binding for hf3fs client library",
    long_description="",
    packages=['hf3fs_fuse'],
    # Single CMake-built extension; built by the CMakeBuild command below.
    ext_modules=[CMakeExtension("hf3fs_py_usrbio")],
    cmdclass={"build_ext": CMakeBuild},
    zip_safe=False,
    extras_require={"test": ["pytest>=6.0"]},
    python_requires=">=3.6",
)

Some files were not shown because too many files have changed in this diff Show More