Initial commit

This commit is contained in:
dev
2025-02-27 21:53:53 +08:00
commit 815e55e4c0
1291 changed files with 185445 additions and 0 deletions

3
src/lib/CMakeLists.txt Normal file
View File

@@ -0,0 +1,3 @@
# Add the common, api and py subdirectories of src/lib to the build.
add_subdirectory(common)
add_subdirectory(api)
add_subdirectory(py)

View File

@@ -0,0 +1,2 @@
# Declare the hf3fs_api and hf3fs_api_shared targets with the same dependency list.
# NOTE(review): target_add_lib / target_add_shared_lib are project helpers defined elsewhere;
# presumably they produce a static and a shared library respectively - confirm.
target_add_lib(hf3fs_api client-lib-common storage-client procps numa rt)
target_add_shared_lib(hf3fs_api_shared client-lib-common storage-client procps numa rt)

805
src/lib/api/UsrbIo.cc Normal file
View File

@@ -0,0 +1,805 @@
#include <cstdint>
#include <fcntl.h>
#include <fmt/format.h>
#include <folly/logging/xlog.h>
#include <iostream>
#include <numa.h>
#include <sys/stat.h>
#include "common/logging/LogInit.h"
#include "common/utils/Duration.h"
#include "common/utils/Path.h"
#include "fuse/IoRing.h"
#include "lib/api/fuse.h"
#include "lib/api/hf3fs.h"
#include "lib/api/hf3fs_usrbio.h"
#include "lib/common/Shm.h"
// Static initializer for the library: sets up logging through the `initLib` instance below.
struct Hf3fsInitLib {
  Hf3fsInitLib() {
    // HF3FS_USRBIO_LIB_LOG supplies the logging config; when unset or empty, "WARN" is used.
    const char *cfg = getenv("HF3FS_USRBIO_LIB_LOG");
    const bool hasCfg = cfg != nullptr && *cfg != '\0';
    hf3fs::logging::initOrDie(hasCfg ? cfg : "WARN");
  }
};
static Hf3fsInitLib initLib;
// Process-wide table of per-mount directory fds; `mtx` guards `mountFds`.
struct Hf3fsLibAliveness {
std::mutex mtx;
// mount point -> fd returned by open("<mount point>/3fs-virt/iovs", O_DIRECTORY)
// NOTE(review): presumably kept open so the FUSE side can tell this process is alive - confirm
std::map<std::string, int> mountFds;
};
static Hf3fsLibAliveness alive;
// Returns true when `fd` answers the HF3FS_IOC_GET_MAGIC_NUM ioctl with HF3FS_SUPER_MAGIC.
// Any ioctl failure is reported as "not hf3fs".
bool hf3fs_is_hf3fs(int fd) {
  uint32_t reported = 0;
  if (ioctl(fd, hf3fs::lib::fuse::HF3FS_IOC_GET_MAGIC_NUM, &reported) != 0) {
    return false;
  }
  return reported == HF3FS_SUPER_MAGIC;
}
int hf3fs_extract_mount_point(char *hf3fs_mount_point, int size, const char *path) {
auto s = getenv("HF3FS_USRBIO_DONT_CHECKFS_FOR_MP");
auto checkFs = !(s && !strcmp(s, "yes"));
std::set<boost::filesystem::path> mps;
auto fp = fopen("/proc/self/mountinfo", "r");
XLOGF_IF(FATAL, !fp, "cannot read system mount info");
char line[4096];
std::vector<std::string> parts;
while (true) {
auto cp = fgets(line, sizeof(line), fp);
if (!cp) {
break;
}
// there are two kinds of mounts:
// 1. directly mount the whole hf3fs
// 2. mount some subdirectories separately
// for case 1, we find the root of the fs directly in the mountinfo
// for case 2, we find the parent of the 3fs-virt dir
parts.clear();
folly::split(' ', line, parts, true);
if (parts.size() < 10) {
continue;
}
if (checkFs && parts[parts.size() - 3] != "fuse.hf3fs") {
continue;
}
auto mp = boost::filesystem::path(parts[4]).lexically_normal();
if (mp.filename() == "3fs-virt") {
mps.insert(mp.parent_path());
} else {
try {
if (boost::filesystem::exists(mp / "3fs-virt")) {
mps.insert(mp);
}
} catch (const boost::filesystem::filesystem_error &) {
// ignore
}
}
}
auto rp = boost::filesystem::canonical(boost::filesystem::path(path)).lexically_normal();
for (auto it = mps.crbegin(); it != mps.crend(); ++it) {
auto &mp = *it;
auto [mpm, rpm] = std::mismatch(mp.begin(), mp.end(), rp.begin(), rp.end());
if (mpm == mp.end()) {
if ((int)mp.size() < size) {
strcpy(hf3fs_mount_point, mp.c_str());
}
return mp.size() + 1;
}
}
return -1;
}
int hf3fs_iovcreate_general(struct hf3fs_iov *iov,
const char *hf3fs_mount_point,
size_t size,
size_t block_size,
int numa,
bool is_io_ring,
bool for_read,
int io_depth,
int priority = 0,
int timeout = 0,
uint64_t flags = 0) {
if (!iov) {
return -EINVAL;
}
if (strlen(hf3fs_mount_point) >= sizeof(iov->mount_point)) {
XLOGF(ERR, "mount point too long '{}'", hf3fs_mount_point);
return -EINVAL;
}
auto p = fmt::format("/hf3fs-iov-{}", hf3fs::Uuid::random());
hf3fs::lib::ShmBuf *shm;
try {
shm = new hf3fs::lib::ShmBuf(p, size, block_size, numa, hf3fs::meta::Uid(getuid()), getpid(), getppid());
} catch (const std::runtime_error &e) {
XLOGF(ERR, "failed to create/map shm for iov {}", e.what());
return -EIO;
}
bool succ = false;
SCOPE_EXIT {
if (!succ) {
delete shm;
}
};
auto target = hf3fs::Path("/dev/shm") / p;
auto link = fmt::format("{}/3fs-virt/iovs/{}{}{}{}{}{}",
hf3fs_mount_point,
shm->id.toHexString(),
block_size ? fmt::format(".b{}", block_size) : std::string(),
is_io_ring ? fmt::format(".{}{}", for_read ? 'r' : 'w', io_depth) : std::string(),
is_io_ring && priority != 0 ? fmt::format(".p{}", priority < 0 ? 'h' : 'l') : std::string(),
is_io_ring ? fmt::format(".t{}", timeout) : std::string(),
is_io_ring && flags != 0 ? fmt::format(".f{:b}", flags) : std::string());
auto lres = symlink(target.c_str(), link.c_str());
if (lres < 0) {
XLOGF(ERR, "failed to register iov '{}' to hf3fs '{}'", target, link);
return -errno;
}
if (is_io_ring) {
shm->maybeUnlinkShm();
}
iov->base = shm->bufStart;
iov->iovh = shm;
memcpy(iov->id, shm->id.data, sizeof(iov->id));
strcpy(iov->mount_point, hf3fs_mount_point);
iov->size = size;
iov->block_size = block_size;
iov->numa = numa;
succ = true;
std::lock_guard lock(alive.mtx);
if (alive.mountFds.find(hf3fs_mount_point) == alive.mountFds.end()) {
auto fd = open(fmt::format("{}/3fs-virt/iovs", hf3fs_mount_point).c_str(), O_DIRECTORY);
alive.mountFds[hf3fs_mount_point] = fd;
XLOGF(INFO, "fd {} for mount {}", fd, hf3fs_mount_point);
}
return 0;
}
void hf3fs_iovdestroy_general(struct hf3fs_iov *iov,
bool is_io_ring,
bool for_read,
int io_depth,
int priority = 0,
int timeout = 0,
uint64_t flags = 0) {
if (!iov) {
return;
}
if (!iov->iovh) { // not allocated by hf3fs_iovalloc(), cannot be freed by hf3fs_iovfree()
if (!is_io_ring) {
XLOGF(ERR, "cannot iovfree() an iov created by iovwrap()");
}
return;
}
hf3fs::Uuid id;
memcpy(id.data, iov->id, sizeof(id.data));
auto link = fmt::format("{}/3fs-virt/iovs/{}{}{}{}{}{}",
iov->mount_point,
id.toHexString(),
iov->block_size ? fmt::format(".b{}", iov->block_size) : std::string(),
is_io_ring ? fmt::format(".{}{}", for_read ? 'r' : 'w', io_depth) : std::string(),
is_io_ring && priority != 0 ? fmt::format(".p{}", priority < 0 ? 'h' : 'l') : std::string(),
is_io_ring ? fmt::format(".t{}", timeout) : std::string(),
is_io_ring && flags != 0 ? fmt::format(".f{}", flags) : std::string());
unlink(link.c_str());
auto *shm = static_cast<hf3fs::lib::ShmBuf *>(iov->iovh);
if (!is_io_ring) {
shm->maybeUnlinkShm();
}
delete (hf3fs::lib::ShmBuf *)iov->iovh;
iov->iovh = nullptr;
}
// Creates a plain (non-ring) iov. Forwards to hf3fs_iovcreate_general() with
// is_io_ring = false, for_read = true and io_depth = 0.
int hf3fs_iovcreate(struct hf3fs_iov *iov, const char *hf3fs_mount_point, size_t size, size_t block_size, int numa) {
  constexpr bool kIsIoRing = false;
  constexpr bool kForRead = true;
  constexpr int kIoDepth = 0;
  return hf3fs_iovcreate_general(iov, hf3fs_mount_point, size, block_size, numa, kIsIoRing, kForRead, kIoDepth);
}
int hf3fs_iovopen(struct hf3fs_iov *iov,
const uint8_t id[16],
const char *hf3fs_mount_point,
size_t size,
size_t block_size,
int numa) {
hf3fs::Uuid uuid;
memcpy(uuid.data, id, sizeof(uuid.data));
auto link = fmt::format("{}/3fs-virt/iovs/{}{}",
hf3fs_mount_point,
uuid.toHexString(),
block_size ? fmt::format(".b{}", block_size) : std::string());
auto shm_path_c = realpath(link.c_str(), nullptr);
if (!shm_path_c) {
XLOGF(ERR, "hf3fs_iovopen realpath failed with errno {}", errno);
return -errno;
}
auto shm_path = std::string(shm_path_c);
free(shm_path_c);
std::string prefix("/dev/shm");
if (shm_path.substr(0, prefix.size()) != prefix) {
XLOGF(ERR, "hf3fs_iovopen shm_path is not in /dev/shm");
return -EINVAL;
}
shm_path = shm_path.substr(prefix.size());
hf3fs::lib::ShmBuf *shm;
try {
shm = new hf3fs::lib::ShmBuf(shm_path, 0, size, block_size, uuid);
} catch (const std::runtime_error &e) {
XLOGF(ERR, "hf3fs_iovopen failed to map shm for iov {}", e.what());
return -EIO;
}
bool succ = false;
SCOPE_EXIT {
if (!succ) {
delete shm;
}
};
iov->base = shm->bufStart;
iov->iovh = shm;
memcpy(iov->id, shm->id.data, sizeof(iov->id));
strcpy(iov->mount_point, hf3fs_mount_point);
iov->size = size;
iov->block_size = block_size;
iov->numa = numa;
succ = true;
return 0;
}
// Calls maybeUnlinkShm() on the shared memory behind `iov`.
// Does nothing for a null iov or for an iov without a handle (one set up by
// hf3fs_iovwrap(), which stores iovh = nullptr).
void hf3fs_iovunlink(struct hf3fs_iov *iov) {
  if (!iov || !iov->iovh) {
    return;
  }
  auto *shm = static_cast<hf3fs::lib::ShmBuf *>(iov->iovh);
  shm->maybeUnlinkShm();
}
// Destroys a plain (non-ring) iov. Forwards to hf3fs_iovdestroy_general() with
// is_io_ring = false, for_read = true and io_depth = 0.
void hf3fs_iovdestroy(struct hf3fs_iov *iov) {
  constexpr bool kIsIoRing = false;
  constexpr bool kForRead = true;
  constexpr int kIoDepth = 0;
  hf3fs_iovdestroy_general(iov, kIsIoRing, kForRead, kIoDepth);
}
// Returns IoRing::bytesRequired(entries), the buffer size used for a ring with `entries` entries.
size_t hf3fs_ior_size(int entries) {
  const auto ringBytes = hf3fs::fuse::IoRing::bytesRequired(entries);
  return ringBytes;
}
// Describes caller-provided memory as an iov without allocating anything.
// The iov gets no handle (iovh = nullptr), so hf3fs_iovdestroy() will not
// free it.
//
// iov:               output descriptor.
// buf:               start of the memory region.
// id:                16-byte buffer id copied into the iov.
// hf3fs_mount_point: mount point; must fit into iov->mount_point.
// size, block_size:  recorded in the iov.
// numa:              when >= 0, the memory is bound to this node with
//                    numa_tonode_memory().
//
// Returns 0 on success, -EINVAL for a null argument or an over-long mount point.
int hf3fs_iovwrap(struct hf3fs_iov *iov,
                  void *buf,
                  const uint8_t id[16],
                  const char *hf3fs_mount_point,
                  size_t size,
                  size_t block_size,
                  int numa) {
  // id and hf3fs_mount_point are dereferenced below; a null buf cannot be used for any io
  if (!iov || !buf || !id || !hf3fs_mount_point) {
    return -EINVAL;
  }

  if (strlen(hf3fs_mount_point) >= sizeof(iov->mount_point)) {
    XLOGF(ERR, "mount point too long '{}'", hf3fs_mount_point);
    return -EINVAL;
  }

  iov->iovh = nullptr;
  iov->base = (uint8_t *)buf;
  memcpy(iov->id, id, sizeof(iov->id));
  strcpy(iov->mount_point, hf3fs_mount_point);
  iov->size = size;
  iov->block_size = block_size;
  iov->numa = numa;

  if (numa >= 0) {
    numa_tonode_memory(buf, size, numa);
  }

  return 0;
}
// Library-side state of an io ring; stored behind hf3fs_ior::iorh.
struct Hf3fsIorHandle {
// the ring itself
std::unique_ptr<hf3fs::fuse::IoRing> ior;
// named semaphore opened by cqeSem(); posted by hf3fs_submit_ios()
sem_t *submitSem;
};
// Opens the submit semaphore that hf3fs advertises for the given priority.
//
// The symlink "<mount point>/3fs-virt/iovs/submit-ios[.ph|.pl]" is read; its
// target must be "/dev/shm/sem.<name>", and the semaphore "/<name>" is opened.
//
// sem:  receives the opened semaphore.
// prio: 0 selects "submit-ios", negative ".ph", positive ".pl".
//
// Returns 0 on success, -errno from readlink()/sem_open(), or -EIO when the
// link target does not have the expected form.
static int cqeSem(sem_t *&sem, const char *hf3fs_mount_point, int prio) {
  const char *suffix = "";
  if (prio < 0) {
    suffix = ".ph";
  } else if (prio > 0) {
    suffix = ".pl";
  }
  auto link = fmt::format("{}/3fs-virt/iovs/submit-ios{}", std::string(hf3fs_mount_point), suffix);

  std::vector<char> target(256);
  auto lres = readlink(link.c_str(), target.data(), target.size());
  if (lres < 0) {
    return -errno;
  }
  if (lres >= (ssize_t)target.size()) {
    XLOGF(ERR, "hf3fs reports strange link target for submit sem");
    return -EIO;
  }

  auto semPath = hf3fs::Path(target).lexically_normal();
  static const auto devShm = hf3fs::Path("/dev/shm");
  auto [sm, pm] = std::mismatch(devShm.begin(), devShm.end(), semPath.begin(), semPath.end());

  // valid only if /dev/shm is a full prefix and exactly one component follows it
  bool valid = sm == devShm.end() && pm != semPath.end();
  if (valid) {
    auto next = pm;
    valid = ++next == semPath.end();
  }
  // ... and that component is "sem." followed by at least one character
  if (valid) {
    const auto &leaf = pm->native();
    valid = leaf.size() > 4 && leaf.substr(0, 4) == "sem.";
  }
  if (!valid) {
    XLOGF(ERR, "invalid submit sem name from hf3fs '{}'", semPath);
    return -EIO;
  }

  // drop the "sem." prefix of the file name to get the semaphore name
  auto semName = std::string("/") + (pm->c_str() + 4);
  sem = sem_open(semName.c_str(), 0);
  if (!sem) {
    return -errno;
  }

  return 0;
}
int hf3fs_iorwrap(struct hf3fs_ior *ior,
void *buf,
const char *hf3fs_mount_point,
size_t size,
bool for_read,
int io_depth,
int priority,
int timeout,
uint64_t flags) {
if (!ior || !buf || !hf3fs_mount_point || !*hf3fs_mount_point || !hf3fs::fuse::IoRing::ioRingEntries(size) ||
timeout < 0) {
return -EINVAL;
}
auto iorh = std::make_unique<Hf3fsIorHandle>(
Hf3fsIorHandle{std::make_unique<hf3fs::fuse::IoRing>(
std::shared_ptr<hf3fs::lib::ShmBuf>{},
"",
hf3fs::meta::UserInfo{hf3fs::meta::Uid{geteuid()}, hf3fs::meta::Gid{getegid()}, ""},
for_read,
(uint8_t *)buf,
size,
io_depth,
priority,
hf3fs::Duration(std::chrono::nanoseconds((uint64_t)timeout * 1000000)),
flags,
false),
nullptr});
if (strlen(hf3fs_mount_point) >= sizeof(ior->mount_point)) {
XLOGF(ERR, "mount point too long '{}'", hf3fs_mount_point);
return -EINVAL;
}
auto res = cqeSem(iorh->submitSem, hf3fs_mount_point, priority);
if (res < 0) {
return res;
}
ior->iorh = iorh.release();
strcpy(ior->mount_point, hf3fs_mount_point);
ior->for_read = for_read;
ior->io_depth = io_depth;
ior->priority = priority;
ior->timeout = timeout;
ior->flags = flags;
ior->iov.size = size;
return 0;
}
// Creates an io ring with priority 0; forwards to hf3fs_iorcreate2().
int hf3fs_iorcreate(struct hf3fs_ior *ior,
                    const char *hf3fs_mount_point,
                    int entries,
                    bool for_read,
                    int io_depth,
                    int numa) {
  constexpr int kPriority = 0;
  return hf3fs_iorcreate2(ior, hf3fs_mount_point, entries, for_read, io_depth, kPriority, numa);
}
// Creates an io ring with the given priority and timeout 0; forwards to hf3fs_iorcreate3().
int hf3fs_iorcreate2(struct hf3fs_ior *ior,
                     const char *hf3fs_mount_point,
                     int entries,
                     bool for_read,
                     int io_depth,
                     int priority,
                     int numa) {
  constexpr int kTimeout = 0;
  return hf3fs_iorcreate3(ior, hf3fs_mount_point, entries, for_read, io_depth, priority, kTimeout, numa);
}
int hf3fs_iorcreate3(struct hf3fs_ior *ior,
const char *hf3fs_mount_point,
int entries,
bool for_read,
int io_depth,
int priority,
int timeout,
int numa) {
if (!ior || !hf3fs_mount_point || !*hf3fs_mount_point || timeout < 0) {
return -EINVAL;
}
auto iov_size = hf3fs_ior_size(entries);
auto res = hf3fs_iovcreate_general(&ior->iov,
hf3fs_mount_point,
iov_size,
0,
numa,
true,
for_read,
io_depth,
priority,
timeout);
if (res < 0) {
XLOGF(ERR, "ioring create failed: hf3fs_iovcreate_general failed {}", res);
return res;
}
res = hf3fs_iorwrap(ior, ior->iov.base, hf3fs_mount_point, ior->iov.size, for_read, io_depth, priority, timeout, 0);
if (res < 0) {
hf3fs_iovdestroy(&ior->iov);
XLOGF(ERR, "ioring create failed: hf3fs_iorwrap failed {}", res);
return res;
}
return 0;
}
int hf3fs_iorcreate4(struct hf3fs_ior *ior,
const char *hf3fs_mount_point,
int entries,
bool for_read,
int io_depth,
int timeout,
int numa,
uint64_t flags) {
if (!ior || !hf3fs_mount_point || !*hf3fs_mount_point || timeout < 0) {
return -EINVAL;
}
auto iov_size = hf3fs_ior_size(entries);
auto res = hf3fs_iovcreate_general(&ior->iov,
hf3fs_mount_point,
iov_size,
0,
numa,
true,
for_read,
io_depth,
0,
timeout,
flags);
if (res < 0) {
XLOGF(ERR, "ioring create failed: hf3fs_iovcreate_general failed {}", res);
return res;
}
res = hf3fs_iorwrap(ior, ior->iov.base, hf3fs_mount_point, ior->iov.size, for_read, io_depth, 0, timeout, flags);
if (res < 0) {
hf3fs_iovdestroy(&ior->iov);
XLOGF(ERR, "ioring create failed: hf3fs_iorwrap failed {}", res);
return res;
}
return 0;
}
void hf3fs_iordestroy(struct hf3fs_ior *ior) {
if (!ior) {
return;
}
hf3fs_iovdestroy_general(&ior->iov, true, ior->for_read, ior->io_depth, ior->priority, ior->timeout, ior->flags);
if (ior->iorh) {
delete (Hf3fsIorHandle *)ior->iorh;
ior->iorh = nullptr;
}
}
// Registration record of a file descriptor. Closes `dupfd` when destroyed.
struct Hf3fsRegisteredFd {
  Hf3fsRegisteredFd(int f, int df, hf3fs::meta::InodeId i, int s)
      : fd(f),
        dupfd(df),
        iid(i),
        status(s) {}
  // the destructor closes dupfd, so a copy would close the same descriptor twice
  Hf3fsRegisteredFd(const Hf3fsRegisteredFd &) = delete;
  Hf3fsRegisteredFd &operator=(const Hf3fsRegisteredFd &) = delete;
  ~Hf3fsRegisteredFd() { close(dupfd); }

  int fd;                    // descriptor passed to hf3fs_reg_fd()
  int dupfd;                 // dup(fd) taken at registration
  hf3fs::meta::InodeId iid;  // inode id from statx() at registration
  int status;                // fcntl(fd, F_GETFL) at registration
};
// Returns the hard limit (rlim_max) of RLIMIT_NOFILE.
// Logs at FATAL level if the limit cannot be queried.
static int noFiles() {
  struct rlimit nofile;
  if (getrlimit(RLIMIT_NOFILE, &nofile) < 0) {
    XLOGF(FATAL, "cannot get limit of number of open files");
  }
  // NOTE(review): rlim_max is an rlim_t and is narrowed to int by this return
  return nofile.rlim_max;
}
// Registration table indexed by file descriptor number, sized once to noFiles().
// hf3fs_reg_fd() stores the same record under both the original fd and its dup.
using Hf3fsRegisteredFds = std::vector<folly::atomic_shared_ptr<Hf3fsRegisteredFd>>;
static Hf3fsRegisteredFds regfds(noFiles());
// Registers `fd` for use with hf3fs_prep_io().
//
// The descriptor is dup'ed and one record is stored under both the original fd
// and the dup.
//
// fd:    descriptor of a file on hf3fs.
// flags: unused.
//
// Returns -dupfd (<= 0) on success. On failure returns a POSITIVE errno value:
// EBADF when fd is out of range or not on hf3fs, EINVAL when fd or the dup is
// already registered or the dup does not fit the table, or errno from
// statx()/dup().
int hf3fs_reg_fd(int fd, uint64_t flags) {
  (void)flags;

  if (fd < 0 || fd >= (int)regfds.size() || !hf3fs_is_hf3fs(fd)) {
    return EBADF;
  }
  if (regfds[fd].load()) {
    return EINVAL;
  }

  struct statx stx;
  auto sres = statx(fd, "", AT_EMPTY_PATH | AT_STATX_DONT_SYNC, STATX_INO, &stx);
  if (sres < 0) {
    return errno;
  }

  auto dupfd = dup(fd);
  if (dupfd < 0) {
    return errno;
  }
  // the table is sized once at load time; a dup beyond it must not be used as an index
  if (dupfd >= (int)regfds.size() || regfds[dupfd].load()) {
    close(dupfd);
    return EINVAL;
  }

  int status = fcntl(fd, F_GETFL);

  std::shared_ptr<Hf3fsRegisteredFd> empty;
  // from here on dupfd is closed by the record's destructor
  auto regfd = std::make_shared<Hf3fsRegisteredFd>(fd, dupfd, hf3fs::meta::InodeId{stx.stx_ino}, status);
  if (!regfds[fd].compare_exchange_strong(empty, regfd)) {
    return EINVAL;  // already registered by another thread
  }

  if (!regfds[dupfd].compare_exchange_strong(empty, regfd)) {
    // roll back the first slot
    empty.reset();
    regfds[fd].store(empty);
    return EINVAL;
  }

  return -dupfd;
}
// Removes the registration of `fd`, clearing both the slot of the dup'ed
// descriptor and the slot of `fd` itself. Does nothing when `fd` is not on
// hf3fs, is out of range, or is not registered.
void hf3fs_dereg_fd(int fd) {
  const bool is3fs = hf3fs_is_hf3fs(fd);
  const bool inRange = fd >= 0 && fd < (int)regfds.size();
  if (!is3fs || !inRange) {
    return;
  }

  auto entry = regfds[fd].load();
  if (!entry) {
    return;
  }

  std::shared_ptr<Hf3fsRegisteredFd> none;
  // each slot is cleared only if it still holds this very record
  auto expectedDup = entry;
  regfds[entry->dupfd].compare_exchange_strong(expectedDup, none);
  regfds[fd].compare_exchange_strong(entry, none);
}
// Returns IoRing::ioRingEntries() for the size of the ring's buffer (ior->iov.size).
// `ior` must not be null.
int hf3fs_io_entries(const struct hf3fs_ior *ior) {
  const auto ringBytes = ior->iov.size;
  return hf3fs::fuse::IoRing::ioRingEntries(ringBytes);
}
// Queues one read or write request on an io ring.
//
// ior:      ring to queue on; `read` must match the ring's direction.
// iov:      buffer that holds (write) or receives (read) the data.
// ptr, len: the range [ptr, ptr + len) must lie inside the iov; len must be > 0.
// fd:       registered descriptor; either the original fd or the value
//           returned by hf3fs_reg_fd() (its absolute value is used).
// off:      file offset.
// userdata: stored with the request.
//
// Returns the slot index of the request (>= 0) on success, -EINVAL for bad
// arguments, -EBADF when fd is not registered, -EACCES when the file's open
// mode does not allow the operation, or -EAGAIN when the ring has no free slot
// or sqe.
int hf3fs_prep_io(const struct hf3fs_ior *ior,
                  const struct hf3fs_iov *iov,
                  bool read,
                  void *ptr,
                  int fd,
                  size_t off,
                  uint64_t len,
                  const void *userdata) {
  auto p = (uint8_t *)ptr;
  // widen before negating: abs(INT_MIN) is undefined
  const long long afd = fd < 0 ? -static_cast<long long>(fd) : fd;

  if (!ior || !ior->iorh || read != ior->for_read || !iov || len == 0 || !iov->base || p < iov->base) {
    return -EINVAL;
  }

  // range check written so that ptr + len cannot wrap around
  const size_t bufOff = p - iov->base;
  if (bufOff > iov->size || len > iov->size - bufOff || afd >= (long long)regfds.size()) {
    return -EINVAL;
  }

  auto regfd = regfds[afd].load();
  if (!regfd) {  // fd not registered
    return -EBADF;
  }

  int status = regfd->status;
  if ((read && (status & O_ACCMODE) == O_WRONLY) || (!read && (status & O_ACCMODE) == O_RDONLY)) {
    return -EACCES;
  }

  auto &iorh = *(Hf3fsIorHandle *)ior->iorh;
  auto &ring = *iorh.ior;

  auto idx = ring.slots.alloc();
  if (!idx) {  // ring is full
    return -EAGAIN;
  }

  auto &args = ring.ringSection[*idx];
  memcpy(args.bufId, iov->id, sizeof(iov->id));
  args.bufOff = bufOff;
  args.fileIid = regfd->iid.u64();
  args.fileOff = off;
  args.ioLen = len;
  args.userdata = userdata;

  if (!ring.addSqe(*idx, userdata)) {
    XLOGF(ERR, "no more sqes when args are added");
    ring.slots.dealloc(*idx);
    return -EAGAIN;
  }

  return *idx;
}
int hf3fs_submit_ios(const struct hf3fs_ior *ior) {
if (!ior || !ior->iorh) {
return -EINVAL;
}
auto &iorh = *(Hf3fsIorHandle *)ior->iorh;
sem_post(iorh.submitSem);
return 0;
}
// Collects completion entries from an io ring.
//
// ior:         ring to poll.
// cqes, cqec:  output array and its capacity.
// min_results: keep waiting until at least this many entries were collected
//              (capped at cqec).
// abs_timeout: absolute CLOCK_REALTIME deadline, or null for no deadline.
//
// Between polls the function sleeps on the ring's cqe semaphore for at most
// HF3FS_USRBIO_WAIT_JITTER_MS milliseconds (default 1; non-positive values are
// ignored).
//
// Returns the number of entries written to `cqes`, which may be below
// min_results when the deadline passes, or -EINVAL for bad arguments.
int hf3fs_wait_for_ios(const struct hf3fs_ior *ior,
                       struct hf3fs_cqe *cqes,
                       int cqec,
                       int min_results,
                       const struct timespec *abs_timeout) {
  if (cqec <= 0 || !cqes || !ior || !ior->iorh) {
    return -EINVAL;
  }

  if (min_results > cqec) {
    min_results = cqec;
  }

  // kept in 64 bits so that jitter * 1000000 below cannot overflow
  long long jitter = 1;
  if (auto js = getenv("HF3FS_USRBIO_WAIT_JITTER_MS")) {
    auto v = atoi(js);
    if (v > 0) {
      jitter = v;
    }
  }

  auto &iorh = *(Hf3fsIorHandle *)ior->iorh;
  auto &ring = *iorh.ior;

  int filled = 0;
  do {
    auto done = ring.cqeCount();
    if (done) {
      done = std::min(done, cqec - filled);
      for (auto i = 0; i < done; ++i) {
        auto t = ring.cqeTail.load();
        if (t == ring.cqeHead.load()) {  // empty, drained by another consumer?
          break;
        }

        const auto &cqe = ring.cqeSection[t];
        // first record the info in curr cqe tail, if we inc first, the info may be overwritten
        const auto index = cqe.index;
        cqes[filled].index = index;
        cqes[filled].result = cqe.result;
        cqes[filled].userdata = cqe.userdata;

        // then we inc the tail, to make sure we're the only one taking it as output
        if (!ring.cqeTail.compare_exchange_strong(
                t,
                (t + 1) % ring.entries)) {  // another thread is also popping out cqe, we'll yield
          break;
        }

        // if we own the cqe, we can add it to our output
        // if inc'ing cqe tail failed in the prev step, discard the recorded cqe info by not inc'ing filled
        ++filled;
        // use the recorded index: once the tail has moved, the entry may already be reused
        ring.slots.dealloc(index);
      }

      // post sem to signal the available slots in cqe section
      hf3fs_submit_ios(ior);
      continue;
    }

    if (filled >= min_results) {  // no more immediate results and we've got enough
      return filled;
    }

    struct timespec ts;
    if (clock_gettime(CLOCK_REALTIME, &ts) < 0) {
      continue;
    }

    if (abs_timeout &&
        (abs_timeout->tv_sec < ts.tv_sec ||
         (abs_timeout->tv_sec == ts.tv_sec && abs_timeout->tv_nsec <= ts.tv_nsec))) {  // already timed out
      return filled;
    }

    // wake up after `jitter` ms ...
    const long long nsec = ts.tv_nsec + jitter * 1000000;
    ts.tv_nsec = nsec % 1000000000;
    ts.tv_sec = ts.tv_sec + nsec / 1000000000;

    // ... but never later than the caller's deadline; comparing both fields
    // avoids building a wake-up time that lies in the past
    if (abs_timeout &&
        (ts.tv_sec > abs_timeout->tv_sec ||
         (ts.tv_sec == abs_timeout->tv_sec && ts.tv_nsec > abs_timeout->tv_nsec))) {
      ts = *abs_timeout;
    }

    // wait for cqe sem, don't care if it succeeds, times out, or even fails
    // we check the cqe section again in any case
    sem_timedwait(ring.cqeSem.get(), &ts);
  } while (filled < cqec);

  return filled;
}
// Creates a hard link `link_name` to `target` through the HF3FS_IOC_HARDLINK ioctl.
//
// The ioctl is issued on `target` and carries the inode number of the
// directory that will contain the link plus the link's file name.
//
// Returns 0 on success, otherwise a POSITIVE errno value: EINVAL for null
// arguments, ENAMETOOLONG when the link's file name does not fit the ioctl
// argument, or errno from open()/stat()/ioctl().
int hf3fs_hardlink(const char *target, const char *link_name) {
  if (!target || !link_name) {
    return EINVAL;
  }

  int fd = open(target, O_RDONLY);
  if (fd == -1) {
    return errno;
  }
  SCOPE_EXIT { close(fd); };

  std::filesystem::path link_path(link_name);
  hf3fs::lib::fuse::Hf3fsIoctlHardlinkArg arg{};

  // the name is copied with its terminating NUL into a fixed-size field
  const std::string name = link_path.filename().string();
  if (name.size() >= sizeof(arg.str)) {
    return ENAMETOOLONG;
  }

  // a bare file name has an empty parent path; it lives in the current directory
  auto parent = link_path.parent_path();
  if (parent.empty()) {
    parent = ".";
  }

  struct stat buf;
  auto res = stat(parent.c_str(), &buf);
  if (res != 0) {
    return errno;
  }

  arg.ino = buf.st_ino;
  strcpy(arg.str, name.c_str());
  res = ioctl(fd, hf3fs::lib::fuse::HF3FS_IOC_HARDLINK, &arg);
  if (res != 0) {
    return errno;
  }

  return 0;
}
int hf3fs_punchhole(int fd, int n, const size_t *start, const size_t *end, size_t flags) {
hf3fs::lib::fuse::Hf3fsIoctlPunchHoleArg arg;
arg.n = n;
arg.flags = flags;
for (int i = 0; i < n; i++) {
arg.start[i] = start[i];
arg.end[i] = end[i];
}
auto res = ioctl(fd, hf3fs::lib::fuse::HF3FS_IOC_PUNCH_HOLE, &arg);
if (res != 0) {
return errno;
}
return 0;
}

277
src/lib/api/UsrbIo.md Normal file
View File

@@ -0,0 +1,277 @@
# USRBIO API Reference
## Overview
User Space Ring Based IO, or USRBIO, is a set of high-speed I/O functions on 3FS. User applications can directly submit I/O requests to the 3FS I/O queue in the FUSE process via the USRBIO API, thereby bypassing certain limitations inherent to FUSE itself. For example, this approach avoids the maximum single I/O size restriction, which is notoriously unfriendly to network file systems. It also makes the data exchange between the user and FUSE processes zero-copy.
## Concepts
**Iov**: A large shared memory region for zero-copy read/write operations, shared between the user and FUSE processes, with InfiniBand (IB) memory registration managed by the FUSE process. In the USRBIO API, all read data is read into an Iov, and all data to be written must first be written into an Iov by the user.
**Ior**: A small shared memory ring for communication between user process and FUSE process. The usage of Ior is similar to Linux [io-uring](https://unixism.net/loti/index.html), where the user application enqueues read/write requests, and the FUSE process dequeues these requests for completion. The I/Os are executed in batches controlled by the `io_depth` parameter, and multiple batches will be executed in parallel, be they from different rings, or even from the same ring. However, multiple rings are still recommended for multi-threaded applications, as synchronization is unavoidable when sharing a ring.
**File descriptor Registration**: Functions are provided for file descriptor registration and deregistration. Only registered fds can be used for USRBIO. The file descriptors in the user application are managed by the Linux kernel, and the FUSE process has no way to know how they are actually associated with the inode IDs it manages. The registration makes the I/O preparation function look more like its [uring counterpart](https://unixism.net/loti/ref-liburing/submission.html).
## Functions
### hf3fs_iorcreate4
#### Summary
Create an Ior instance. All `hf3fs_iorcreate*` functions create Ior instances, but include various configurable parameters due to compatibility considerations. The `struct hf3fs_ior` instance can be allocated on stack as a local variable or as a member field of another struct. The create functions will not allocate memory for it, and the destroy function will not deallocate. The `struct hf3fs_iov` is the same.
#### Syntax
```c
int hf3fs_iorcreate4(struct hf3fs_ior *ior,
const char *hf3fs_mount_point,
int entries,
bool for_read,
int io_depth,
int timeout,
int numa,
uint64_t flags);
```
#### Parameters
- **ior**: Address for `hf3fs_ior`.
- **hf3fs_mount_point**: Mount point for 3FS. This parameter is used to distinguish 3FS clusters, enabling a single machine to mount multiple 3FS instances.
- **entries**: Maximum number of concurrent read/write requests that can be submitted.
- **for_read**: `true` if this Ior handles read requests, `false` if this Ior handles write requests. An Ior cannot handle read requests and write requests simultaneously.
- **io_depth**: `0` for no control over I/O depth. If greater than 0, I/O requests are issued to the server as a batch only once `io_depth` requests are in the queue. If smaller than 0, USRBIO waits until at most `-io_depth` I/O requests are in the queue and issues them in one batch.
- **timeout**: Maximum wait time for batching when `io_depth` < 0, in milliseconds. Must not be negative.
- **numa**: Numa ID for Ior shared memory. `-1` for current process numa ID.
- **flags**: A flag composed of OR-ed bits to specify special behaviors.
#### Return Value
- If success, return 0.
- If fail, return `-errno`.
#### Example
```c
struct hf3fs_ior ior;
hf3fs_iorcreate4(&ior, "/hf3fs/mount/point", 1024, true, 0, 0, -1, 0);
hf3fs_iordestroy(&ior);
```
### hf3fs_iordestroy
#### Summary
Destroy an Ior.
#### Syntax
```c
void hf3fs_iordestroy(struct hf3fs_ior *ior);
```
#### Parameters
- **ior**: Address for Ior.
### hf3fs_iovcreate2
#### Summary
Create an Iov instance and allocate shared memory for that Iov. All `hf3fs_iovcreate*` functions create Iov instances, but include various configurable parameters due to compatibility considerations.
#### Syntax
```c
int hf3fs_iovcreate2(struct hf3fs_iov *iov,
const char *hf3fs_mount_point,
size_t size,
size_t block_size,
int numa,
const char *shm_path);
```
#### Parameters
- **iov**: Address for Iov.
- **hf3fs_mount_point**: Mount point for 3FS. This parameter is used to distinguish 3FS clusters, enabling a single machine to mount multiple 3FS instances.
- **size**: Shared memory size for this Iov.
- **block_size**: If not `0`, this function will allocate multiple shared memory blocks, each sized no larger than `block_size`. `0` for allocate a single large shared memory. All IOs on this Iov should not span across the block margin. This parameter is for optimization on IB register time.
- **numa**: Numa ID for Iov shared memory. `-1` for current process numa ID.
- **shm_path**: Path of `tmpfs` mount point for shared memory allocation. If `nullptr`, use the default `/dev/shm`.
#### Return Value
- If success, return 0.
- If fail, return `-errno`.
#### Example
```c
struct hf3fs_iov iov;
hf3fs_iovcreate2(&iov, "/hf3fs/mount/point", 1 << 30, 0, -1, nullptr);
hf3fs_iovdestroy(&iov);
```
### hf3fs_iovdestroy
#### Summary
Destroy an Iov.
#### Syntax
```c
void hf3fs_iovdestroy(struct hf3fs_iov *iov);
```
#### Parameters
- **iov**: Address for Iov.
### hf3fs_reg_fd
#### Summary
Register a file descriptor for FUSE IO.
#### Syntax
```c
int hf3fs_reg_fd(int fd, uint64_t flags);
```
#### Parameters
- **fd**: A Linux file descriptor.
- **flags**: Unused. For future use.
#### Return Value
- If success, return an integer less than or equal to 0. This integer can be used in `hf3fs_prep_io` as `fd`. You can view this as an extra `fd` which is only usable in the USRBIO API; `hf3fs_prep_io` accepts both this new `fd` and the original Linux `fd`.
- If fail, return `errno` (a positive value, unlike the other functions, which return `-errno`).
### hf3fs_dereg_fd
#### Summary
Deregister a file descriptor.
#### Syntax
```c
void hf3fs_dereg_fd(int fd);
```
#### Parameters
- **fd**: A Linux file descriptor.
#### Example
```c
int fd = open("example.txt", O_RDONLY);
hf3fs_reg_fd(fd, 0);
hf3fs_dereg_fd(fd);
close(fd);
```
### hf3fs_prep_io
#### Summary
Submit an I/O request to an Ior.
#### Syntax
```c
int hf3fs_prep_io(const struct hf3fs_ior *ior,
const struct hf3fs_iov *iov,
bool read,
void *ptr,
int fd,
size_t off,
uint64_t len,
const void *userdata);
```
#### Parameters
- **ior**: Address for Ior.
- **iov**: Address for Iov.
- **read**: `true` for read, `false` for write. Must match the Ior create parameters.
- **ptr**: The address for I/O operation. `[ptr, ptr + len)` must be fully in the range provided by the Iov.
- **fd**: File for I/O operation. Must be registered by `hf3fs_reg_fd`.
- **off**: Offset in file.
- **len**: Read size or write size.
- **userdata**: Arbitrary data which will be returned by `hf3fs_wait_for_ios`.
#### Return Value
- If success, return the index of I/O request in the Ior.
- If fail, return `-errno`.
#### Notes
- This function may not be thread safe.
### hf3fs_submit_ios
#### Summary
Notify the FUSE process that new I/O operations have been submitted.
#### Syntax
```c
int hf3fs_submit_ios(const struct hf3fs_ior *ior);
```
#### Parameters
- **ior**: Address for Ior.
#### Return Value
- If success, return 0.
- If fail, return `-errno`.
#### Notes
- The I/O operations may be executed **before** you call `hf3fs_submit_ios`. This function only notifies the FUSE process that there is work to do; the FUSE process also scans for new operations periodically.
### hf3fs_wait_for_ios
#### Summary
Wait and get results for completed I/O operations.
#### Syntax
```c
int hf3fs_wait_for_ios(const struct hf3fs_ior *ior,
struct hf3fs_cqe *cqes,
int cqec,
int min_results,
const struct timespec *abs_timeout);
```
#### Parameters
- **ior**: Address for Ior.
- **cqes**: Address for `hf3fs_cqe`s. These will contain the I/O operation results and the `userdata` provided to `hf3fs_prep_io`.
- **cqec**: The size of array pointed by `cqes`.
- **min_results**: Minimum number of results to return.
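- **abs_timeout**: Absolute deadline (`CLOCK_REALTIME`) after which the call returns even if fewer than `min_results` results are available. `nullptr` means no deadline.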
#### Return Value
- If success, return number of completed I/O requests.
- If fail, return `-errno`.
#### Example
```c
hf3fs_prep_io(&ior, &iov, true, iov.base, fd, 0, 4096, nullptr);
hf3fs_prep_io(&ior, &iov, true, iov.base + 4096, fd, 4096, 4096, nullptr);
hf3fs_submit_ios(&ior);
hf3fs_cqe cqes[2];
hf3fs_wait_for_ios(&ior, cqes, 2, 2, nullptr);
```
#### Notes
- It is OK to call `hf3fs_prep_io` and `hf3fs_submit_ios` in one thread, and call `hf3fs_wait_for_ios` in another thread. But only one thread can call `hf3fs_prep_io` and `hf3fs_submit_ios`, and only one thread can call `hf3fs_wait_for_ios`.
## Example
```c
#include <hf3fs_usrbio.h>
constexpr uint64_t NUM_IOS = 1024;
constexpr uint64_t BLOCK_SIZE = (32 << 20);
int main() {
struct hf3fs_ior ior;
hf3fs_iorcreate4(&ior, "/hf3fs/mount/point", NUM_IOS, true, 0, 0, -1, 0);
struct hf3fs_iov iov;
hf3fs_iovcreate2(&iov, "/hf3fs/mount/point", NUM_IOS * BLOCK_SIZE, 0, -1, nullptr);
int fd = open("/hf3fs/mount/point/example.bin", O_RDONLY);
hf3fs_reg_fd(fd, 0);
for (int i = 0; i < NUM_IOS; i++) {
hf3fs_prep_io(&ior, &iov, true, iov.base + i * BLOCK_SIZE, fd, i * BLOCK_SIZE, BLOCK_SIZE, nullptr);
}
hf3fs_submit_ios(&ior);
hf3fs_cqe cqes[NUM_IOS];
hf3fs_wait_for_ios(&ior, cqes, NUM_IOS, NUM_IOS, nullptr);
hf3fs_dereg_fd(fd);
close(fd);
hf3fs_iovdestroy(&iov);
hf3fs_iordestroy(&ior);
return 0;
}
```

58
src/lib/api/fuse.h Normal file
View File

@@ -0,0 +1,58 @@
#pragma once
#include <asm-generic/ioctl.h>
#include <cstdint>
#include <linux/limits.h>
#include <sys/ioctl.h>
#include <sys/types.h>
// ioctl request codes and argument structs for hf3fs.
namespace hf3fs::lib::fuse {
// ioctl type character shared by all request codes below
#define HF3FS_IOCTYPE_ID 'h'
// capacity of the start/end arrays in Hf3fsIoctlPunchHoleArg
#define HF3FS_IOCTL_PUNCH_HOLE_MAX 1000
// argument of HF3FS_IOC_GET_MOUNT_NAME
struct Hf3fsIoctlGetMountNameArg {
char str[32];
};
// argument of HF3FS_IOC_HARDLINK
// NOTE(review): str holds NAME_MAX bytes while the name fields of Hf3fsIoctlMove/Hf3fsIoctlRemove
// hold NAME_MAX + 1; the size is part of the request code, so changing it changes the ioctl number
struct Hf3fsIoctlHardlinkArg {
ino_t ino;
char str[NAME_MAX];
};
// argument of HF3FS_IOC_PUNCH_HOLE: n entries of start[]/end[] are used
struct Hf3fsIoctlPunchHoleArg {
int n;
size_t flags;
size_t start[HF3FS_IOCTL_PUNCH_HOLE_MAX];
size_t end[HF3FS_IOCTL_PUNCH_HOLE_MAX];
};
// argument of HF3FS_IOC_MOVE
// NOTE(review): srcParent/dstParent are presumably inode ids of the parent directories - confirm
struct Hf3fsIoctlMove {
uint64_t srcParent;
char srcName[NAME_MAX + 1];
uint64_t dstParent;
char dstName[NAME_MAX + 1];
bool moveToTrash;
};
// argument of HF3FS_IOC_REMOVE
// NOTE(review): parent is presumably the inode id of the parent directory - confirm
struct Hf3fsIoctlRemove {
uint64_t parent;
char name[NAME_MAX + 1];
bool recursive;
};
// request codes; the command numbers 4-9 are not used
enum {
HF3FS_IOC_GET_MOUNT_NAME = _IOR(HF3FS_IOCTYPE_ID, 0, Hf3fsIoctlGetMountNameArg),
HF3FS_IOC_GET_PATH_OFFSET = _IOR(HF3FS_IOCTYPE_ID, 1, uint32_t),
HF3FS_IOC_GET_MAGIC_NUM = _IOR(HF3FS_IOCTYPE_ID, 2, uint32_t),
HF3FS_IOC_GET_IOCTL_VERSION = _IOR(HF3FS_IOCTYPE_ID, 3, uint32_t),
HF3FS_IOC_RECURSIVE_RM = _IOR(HF3FS_IOCTYPE_ID, 10, uint32_t),
HF3FS_IOC_FSYNC = _IOR(HF3FS_IOCTYPE_ID, 11, uint32_t),
HF3FS_IOC_HARDLINK = _IOW(HF3FS_IOCTYPE_ID, 12, Hf3fsIoctlHardlinkArg),
HF3FS_IOC_PUNCH_HOLE = _IOW(HF3FS_IOCTYPE_ID, 13, Hf3fsIoctlPunchHoleArg),
HF3FS_IOC_MOVE = _IOW(HF3FS_IOCTYPE_ID, 14, Hf3fsIoctlMove),
HF3FS_IOC_REMOVE = _IOW(HF3FS_IOCTYPE_ID, 15, Hf3fsIoctlRemove),
};
} // namespace hf3fs::lib::fuse

174
src/lib/api/hf3fs.h Normal file
View File

@@ -0,0 +1,174 @@
#pragma once
#include <dirent.h>
#include <memory>
#include <optional>
#include <string>
#include <sys/stat.h>
#include <sys/vfs.h>
#include "hf3fs_expected.h"
#define HF3FS_SUPER_MAGIC 0x8f3f5fff // hf3fs fff
namespace hf3fs::lib {
// Result<T> holds either a T or an error given as an (int, std::string) pair.
// NOTE(review): the int is presumably an errno-style code and the string a message - confirm
template <typename T>
using Result = nonstd::expected<T, std::pair<int, std::string>>;
// placeholder value type for operations that return nothing on success
struct Empty {};
using NoResult = Result<Empty>;
// directory stream type; only declared here
struct DIR;
// directory entry returned by IClient::readdir(): entry type and name
struct dirent {
unsigned char d_type;
std::string d_name;
};
// the system `struct stat` extended with an optional symlink target
struct stat : public ::stat {
std::optional<std::string> st_ltarg; // symlink target
};
// a file descriptor paired with a file offset; both default to -1
struct ioseg {
int fd = -1;
off_t off = -1;
};
// opaque handle type of an allocated io buffer; only declared here
struct iovec_handle_t;
// io buffer descriptor: a memory range plus the handle of the allocation it belongs to
struct iovec {
// base/len can be different than returned by iovalloc, so long as the whole buffer is within the range
void *iov_base;
size_t iov_len;
// returned by iovalloc
std::shared_ptr<iovec_handle_t> iov_handle;
// caller-defined pointer, null by default
void *user_data = nullptr;
};
// Abstract client interface. Every operation is pure virtual; instances are obtained through the
// static newClient()/newSuperClient() factories, which return either a shared client or an error.
// Operations report failure through Result<T>/NoResult instead of a return code.
// Most path-based calls come in two flavours: `foo(path, ...)` and `fooat(dirfd, path, ...)`.
class IClient {
public:
static Result<std::shared_ptr<IClient>> newClient(std::string_view mountName, std::string_view token);
static Result<std::shared_ptr<IClient>> newSuperClient(std::string_view mountName, std::string_view token);
IClient() = default;
virtual ~IClient() = default;
public: // meta ops
virtual NoResult statfs(std::string_view path, struct statfs *buf) = 0;
virtual NoResult fstatfs(int fd, struct statfs *buf) = 0;
virtual Result<std::shared_ptr<DIR>> opendir(std::string_view name, bool ignoreCache = false) = 0;
virtual Result<std::shared_ptr<DIR>> opendirat(int dirfd, std::string_view name, bool ignoreCache = false) = 0;
virtual Result<std::shared_ptr<DIR>> fdopendir(int fd) = 0;
virtual NoResult rewinddir(const std::shared_ptr<DIR> &dirp) = 0;
// the optional is part of the success value, i.e. "no entry" is not reported as an error
virtual Result<std::optional<dirent>> readdir(const std::shared_ptr<DIR> &dirp) = 0;
virtual Result<int> dirfd(const std::shared_ptr<DIR> &dirp) = 0;
virtual NoResult mkdir(std::string_view pathname, mode_t mode = 0777, bool recursive = false) = 0;
virtual NoResult mkdirat(int dirfd, std::string_view pathname, mode_t mode = 0777, bool recursive = false) = 0;
virtual NoResult rmdir(std::string_view pathname, bool recursive = false) = 0;
virtual NoResult rmdirat(int dirfd, std::string_view pathname, bool recursive = false) = 0;
virtual NoResult unlink(std::string_view pathname) = 0;
virtual NoResult unlinkat(int dirfd, std::string_view pathname, int flags, bool recursive = false) = 0;
// note the parameter order: isDir comes before recursive
virtual NoResult remove(std::string_view pathname,
std::optional<bool> isDir = std::nullopt,
bool recursive = false) = 0;
virtual NoResult removeat(int dirfd,
std::string_view pathname,
std::optional<bool> isDir = std::nullopt,
bool recursive = false) = 0;
virtual NoResult rename(std::string_view oldpath, std::string_view newpath) = 0;
virtual NoResult renameat(int olddirfd, std::string_view oldpath, int newdirfd, std::string_view newpath) = 0;
virtual NoResult stat(std::string_view pathname, struct stat *statbuf, bool ignoreCache = false) = 0;
virtual NoResult fstat(int fd, struct stat *statbuf) = 0;
virtual NoResult lstat(std::string_view pathname, struct stat *statbuf, bool ignoreCache = false) = 0;
virtual NoResult fstatat(int dirfd, std::string_view pathname, struct stat *statbuf, int flags) = 0;
virtual Result<std::string> readlink(std::string_view pathname, bool ignoreCache = false) = 0;
virtual Result<std::string> readlinkat(int dirfd, std::string_view pathname, bool ignoreCache = false) = 0;
virtual Result<std::string> realpath(std::string_view pathname, bool absolute = true) = 0;
virtual Result<std::string> realpathat(int dirfd, std::string_view pathname, bool absolute = true) = 0;
virtual Result<int> dup(int oldfd) = 0;
virtual Result<int> dup2(int oldfd, int newfd) = 0;
virtual Result<int> dup3(int oldfd, int newfd, int flags) = 0;
// NOTE: we don't use umask to modify mode when open()/creat() new files
virtual Result<int> creat(std::string_view pathname,
mode_t mode = 0666,
bool excl = false,
bool ignoreCache = false) = 0;
virtual Result<int> creatat(int dirfd,
std::string_view pathname,
mode_t mode = 0666,
bool excl = false,
bool ignoreCache = false) = 0;
virtual NoResult symlink(std::string_view target, std::string_view linkpath) = 0;
virtual NoResult symlinkat(std::string_view target, int newdirfd, std::string_view linkpath) = 0;
virtual NoResult link(std::string_view oldpath, std::string_view newpath) = 0;
virtual NoResult linkat(int olddirfd,
std::string_view oldpath,
int newdirfd,
std::string_view newpath,
int flags) = 0;
// we use O_NONBLOCK flag to indicate we want to ignore the inode cache
// if you want to read file immediately after operating on it, use this flag
virtual Result<int> open(std::string_view pathname, int flags, mode_t mode = 0666) = 0;
virtual Result<int> openat(int dirfd, std::string_view pathname, int flags, mode_t mode = 0666) = 0;
virtual NoResult close(int fd) = 0;
virtual NoResult utimes(std::string_view filename, const struct timeval times[2]) = 0;
virtual NoResult futimens(int fd, const struct timespec times[2]) = 0;
virtual NoResult utimensat(int dirfd, std::string_view pathname, const struct timespec times[2], int flags) = 0;
virtual NoResult chmod(std::string_view pathname, mode_t mode) = 0;
// virtual NoResult fchmod(int fd, mode_t mode) = 0;
virtual NoResult fchmodat(int dirfd, std::string_view pathname, mode_t mode, int flags) = 0;
virtual NoResult chown(std::string_view pathname, uid_t owner, gid_t group) = 0;
// virtual NoResult fchown(int fd, uid_t owner, gid_t group) = 0;
virtual NoResult lchown(std::string_view pathname, uid_t owner, gid_t group) = 0;
virtual NoResult fchownat(int dirfd, std::string_view pathname, uid_t owner, gid_t group, int flags) = 0;
virtual NoResult chdir(std::string_view path, bool ignoreCache = false) = 0;
virtual NoResult fchdir(int fd) = 0;
virtual Result<std::string> getcwd() = 0;
public: // io ops
virtual Result<struct iovec> iovalloc(size_t bytes, int numa = -1, bool global = false, size_t blockSize = 0) = 0;
virtual NoResult iovfree(const std::shared_ptr<iovec_handle_t> &iovh) = 0;
// has to use iovec returned by iovalloc, zero copy
// iov, segv and resv are parallel arrays; iovcnt is passed as their element count
virtual NoResult preadv(int iovcnt, const struct iovec *iov, const struct ioseg *segv, ssize_t *resv) = 0;
virtual NoResult pwritev(int iovcnt, const struct iovec *iov, const struct ioseg *segv, ssize_t *resv) = 0;
// can use buffer allocated by caller, not zero copy, offset is maintained by client
virtual Result<size_t> read(int fd, void *buf, size_t count, size_t readahead = 0) = 0;
virtual Result<size_t> write(int fd, const void *buf, size_t count, bool flush = false) = 0;
// may not be very accurate if seek from end,
// since other clients may be writing and moving the eof when we're seeking
virtual Result<off_t> lseek(int fd, off_t offset, int whence, size_t readahead = 0) = 0;
virtual NoResult fdatasync(int fd) = 0;
virtual NoResult fsync(int fd) = 0;
virtual NoResult ftruncate(int fd, off_t length) = 0;
public: // advanced ops
// caller can distribute the serialized sharedFileHandles to clients on other machines
// and call openWithFileHandles() there to get fds without server communication
// only fds opened for read/path/dirfd can be used this way, the root dir cannot though
// used to reduce calls to the meta server, thus improve perf
// how to distribute the bytes is not in consideration of the hf3fs client
virtual Result<std::vector<uint8_t>> sharedFileHandles(const std::vector<int> &fds) = 0;
// can return fewer than fhs, if too many files are opened
virtual Result<std::vector<int>> openWithFileHandles(const std::vector<uint8_t> &fhs) = 0;
virtual Result<std::string> sharedIovecHandle(const std::shared_ptr<iovec_handle_t> &iovh) = 0;
virtual Result<struct iovec> openIovecHandle(const std::string &iovh, bool acrossAgent = false) = 0;
};
} // namespace hf3fs::lib

2031
src/lib/api/hf3fs_expected.h Normal file

File diff suppressed because it is too large Load Diff

172
src/lib/api/hf3fs_usrbio.h Normal file
View File

@@ -0,0 +1,172 @@
#pragma once
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
#include <sys/types.h>
#define HF3FS_SUPER_MAGIC 0x8f3f5fff // hf3fs fff
typedef void *hf3fs_iov_handle;
// for data src/dst when writing to/reading from hf3fs storage
// also as the base buffer for ior
// however, if you already have a shared buffer, skip hf3fs_iovwrap() and go for hf3fs_iorwrap() directly
struct hf3fs_iov {
// start of the buffer (for wrapped buffers, the `base` argument given to hf3fs_iovwrap())
uint8_t *base;
// opaque handle (hf3fs_iov_handle is a void pointer)
hf3fs_iov_handle iovh;
// 16-byte id, same width as the `id` argument of hf3fs_iovopen()/hf3fs_iovwrap()
char id[16];
// copy of the hf3fs_mount_point string given at creation, at most 256 bytes
char mount_point[256];
size_t size;
size_t block_size;
int numa;
};
typedef void *hf3fs_ior_handle;
// for submitting ios to/reaping results from hf3fs fuse
// the fields below mirror the arguments accepted by the hf3fs_iorcreate*() family
struct hf3fs_ior {
// buffer backing the ring; hf3fs_ior_size() computes the size it needs for a given entry count
struct hf3fs_iov iov;
// opaque handle (hf3fs_ior_handle is a void pointer)
hf3fs_ior_handle iorh;
char mount_point[256];
bool for_read;
// io_depth > 0 to make the io worker to process io_depth ios each time
// say when reading exactly one sample batch for training
// == 0 to process all the prepared ios ASAP
// notice that hf3fs_submit_ios() is just a last resort hint to the io worker
// which may process prepared ios before they're submitted so long as it discovers them
// < 0 means to process ios ASAP but not more than -io_depth ios each time
// in case there are too many ios to finish in a reasonable time
int io_depth;
int priority;
// NOTE(review): the unit of timeout is not stated in this header -- confirm against the implementation
int timeout;
// presumably a bit set of the HF3FS_IOR_* constants defined next to hf3fs_iorcreate4() -- confirm
uint64_t flags;
};
// one completion entry, filled into the `cqes` array passed to hf3fs_wait_for_ios()
struct hf3fs_cqe {
// presumably the io index that hf3fs_prep_io() returned for this request -- confirm
int32_t index;
int32_t reserved;
int64_t result;
// same type as the `userdata` argument of hf3fs_prep_io(); presumably echoed back unchanged -- confirm
const void *userdata;
};
// 1 for yes, 0 for no
bool hf3fs_is_hf3fs(int fd);
// returns length of the mount point + 1 if it is a valid path on an hf3fs instance
// return -1 if it is not a valid path on an hf3fs instance
// if the returned size is larger than the size in args,
// the passed-in hf3fs_mount_point buffer is not long enough
int hf3fs_extract_mount_point(char *hf3fs_mount_point, int size, const char *path);
// 0 for success, -errno for error
// iov ptr itself should be allocated by caller, it could be on either stack or heap
// the pointer hf3fs_mount_point will be copied into the corresponding field in hf3fs_iov
// it should not be too long
int hf3fs_iovcreate(struct hf3fs_iov *iov, const char *hf3fs_mount_point, size_t size, size_t block_size, int numa);
int hf3fs_iovopen(struct hf3fs_iov *iov,
const uint8_t id[16],
const char *hf3fs_mount_point,
size_t size,
size_t block_size,
int numa);
void hf3fs_iovunlink(struct hf3fs_iov *iov);
// iov ptr itself will not be freed
void hf3fs_iovdestroy(struct hf3fs_iov *iov);
// the iovalloc is actually creating a shm and symlink into a virtual dir in hf3fs
// so the user may want to wrap an already registered shm (for any reason)
// iov ptr itself should be allocated by caller
// the wrapped iov cannot be destroyed by hf3fs_iovdestroy()
// the underlying base ptr should not be unmapped when the iov (or its corresponding ior) is still being used
int hf3fs_iovwrap(struct hf3fs_iov *iov,
void *base,
const uint8_t id[16],
const char *hf3fs_mount_point,
size_t size,
size_t block_size,
int numa);
// calculate required memory size with wanted entries
// the calculated size can be used to create the underlying iov
size_t hf3fs_ior_size(int entries);
// ior ptr itself should be allocated by caller, on either stack or heap
int hf3fs_iorcreate(struct hf3fs_ior *ior,
const char *hf3fs_mount_point,
int entries,
bool for_read,
int io_depth,
int numa);
int hf3fs_iorcreate2(struct hf3fs_ior *ior,
const char *hf3fs_mount_point,
int entries,
bool for_read,
int io_depth,
int priority,
int numa);
int hf3fs_iorcreate3(struct hf3fs_ior *ior,
const char *hf3fs_mount_point,
int entries,
bool for_read,
int io_depth,
int priority,
int timeout,
int numa);
#define HF3FS_IOR_ALLOW_READ_UNCOMMITTED 1
#define HF3FS_IOR_FORBID_READ_HOLES 2
int hf3fs_iorcreate4(struct hf3fs_ior *ior,
const char *hf3fs_mount_point,
int entries,
bool for_read,
int io_depth,
int timeout,
int numa,
uint64_t flags);
// ior ptr itself will not be freed
void hf3fs_iordestroy(struct hf3fs_ior *ior);
// <= 0 for io-preppable file handle, errno for error
// fd has to be registered before used in hf3fs_prep_io()
// registered fds should not be closed, and even if it's closed, the old inode will still be used to prep io
// also, if a registered fd is closed, and a new fd with the same integer value is to be registered
// the registration will fail with an EINVAL
int hf3fs_reg_fd(int fd, uint64_t flags);
void hf3fs_dereg_fd(int fd);
// report max number of entries in the ioring
int hf3fs_io_entries(const struct hf3fs_ior *ior);
// >= 0 for io index, -errno for error
// this function is *NOT* thread safe!!!!!
// do not prepare io in the same ioring from different threads
// or the batches may be mixed and things may get ugly for *YOU*
// with such assumption, we don't waste time for the thread-safety
int hf3fs_prep_io(const struct hf3fs_ior *ior,
const struct hf3fs_iov *iov,
bool read,
void *ptr,
int fd,
size_t off,
uint64_t len,
const void *userdata);
// 0 for success, -errno for error
int hf3fs_submit_ios(const struct hf3fs_ior *ior);
// >= 0 for result count, -errno for error, may return fewer than ready, call again to make sure
int hf3fs_wait_for_ios(const struct hf3fs_ior *ior,
struct hf3fs_cqe *cqes,
int cqec,
int min_results,
const struct timespec *abs_timeout);
int hf3fs_hardlink(const char *target, const char *link_name);
int hf3fs_punchhole(int fd, int n, const size_t *start, const size_t *end, size_t flags);
#ifdef __cplusplus
}
#endif

View File

@@ -0,0 +1 @@
target_add_lib(client-lib-common common numa rt)

View File

@@ -0,0 +1,225 @@
#pragma once
#include <folly/concurrency/AtomicSharedPtr.h>
#include <memory>
#include <ranges>
#include <shared_mutex>
#include <vector>
#include "common/utils/Result.h"
#include "common/utils/RobinHood.h"
namespace hf3fs::lib {
// Fixed-capacity table of shared items belonging to one process, indexed by small integers.
//
// Slots are handed out lowest-index-first: indices released by resetAt() are kept in `free_` as a min-heap
// (std::greater), and `nextAvail_` is the first index that has never been used. The table vector is sized once in
// the constructor and never resized afterwards, which is what allows at() to read it without taking the mutex.
//
// KeyError / OverflowError are the status codes reported for a bad lookup and for a full table respectively.
template <typename T, uint32_t KeyError, uint32_t OverflowError>
class PerProcTable {
 public:
  using Item = T;
  using ItemPtr = std::shared_ptr<Item>;

  PerProcTable() = delete;

  // Creates an empty table with `cap` slots for process `p` whose parent is `pp`.
  PerProcTable(int p, int pp, size_t cap)
      : pid_(p),
        ppid_(pp),
        table_(cap) {
    free_.reserve(cap);
  }

  // Creates the table for process `p` (parent `pp`). When `rhs` is the table of that parent, its entries and
  // free list are copied; otherwise the new table starts empty. The table is never smaller than `rhs`'s,
  // or items may drop out.
  PerProcTable(const PerProcTable &rhs, int p, int pp, size_t cap)
      : pid_(p),
        ppid_(pp) {
    if (ppid_ == rhs.pid_) {
      std::shared_lock lock(rhs.mtx_);
      std::vector<AtomicItemPtr> t(std::max(rhs.table_.size(), cap));
      nextAvail_ = rhs.nextAvail_;
      for (int i = 0; i < nextAvail_; ++i) {
        t[i] = rhs.table_[i].load();
      }
      swap(t, table_);
      free_ = rhs.free_;
    } else {  // rhs is not our parent proc, so do not copy its fd table
      std::vector<AtomicItemPtr> t(cap);
      swap(t, table_);
    }
    // the free list can never hold more entries than there are slots
    free_.reserve(table_.size());
  }

  int pid() const { return pid_; }
  int ppid() const { return ppid_; }

  // Looks up the item stored at `idx`.
  // Returns KeyError when `idx` is negative or not a valid slot, and, when `check` is true, also when the slot is
  // empty. With `check == false` an empty slot yields a null pointer instead of an error.
  hf3fs::Result<ItemPtr> at(int idx, bool check = true) {
    if (idx < 0) {
      XLOGF(DBG, "bad index {}", idx);
      return makeError(KeyError,
                       fmt::format("invalid index {} to lookup in table for proc {} parent {}", idx, pid_, ppid_));
    }
    // no lock needed: table_ keeps its size for the lifetime of the object and every slot is atomic.
    // valid slots are [0, size), so idx == size must be rejected as well
    if ((size_t)idx >= table_.size()) {
      XLOGF(DBG, "invalid index {}", idx);
      return makeError(KeyError, fmt::format("invalid index {} in table for proc {} parent {}", idx, pid_, ppid_));
    }
    auto p = table_[idx].load();
    if (check && !p) {
      XLOGF(DBG, "no item at index {}", idx);
      return makeError(KeyError, fmt::format("index {} not found in table for proc {} parent {}", idx, pid_, ppid_));
    }
    return p;
  }

  // Stores `v` at the caller-chosen slot `idx`, keeping the free list consistent when the slot was empty.
  // `idx` is not range-checked here; the caller must pass a valid slot.
  void setAt(int idx, const ItemPtr &v) {
    std::unique_lock lock(mtx_);
    auto p = table_[idx].load();
    XLOGF(DBG,
          "before set at idx {} table idx {} next avail {} free size {}",
          idx,
          (void *)p.get(),
          nextAvail_,
          free_.size());
    if (!p) {  // add to specific place
      if (idx >= nextAvail_) {
        // every never-used slot we jump over becomes available for add()
        for (auto i = nextAvail_; i < idx; ++i) {
          free_.push_back(i);
        }
        std::ranges::make_heap(free_, std::greater<int>());
        nextAvail_ = idx + 1;
      } else {
        // the slot was released earlier, so it is expected to sit in the free list; take it out.
        // if it is unexpectedly missing, leave the list alone rather than dereference end()
        auto it = std::ranges::find(free_, idx);
        assert(it != free_.end());
        if (it != free_.end()) {
          *it = free_.back();
          free_.pop_back();
          std::ranges::make_heap(free_, std::greater<int>());
        }
      }
    }
    table_[idx] = v;
  }

  // Stores `v` in the lowest available slot and returns its index, or OverflowError when the table is full.
  hf3fs::Result<int> add(const ItemPtr &v) {
    std::unique_lock lock(mtx_);
    auto useFree = !free_.empty();
    if (!useFree && (size_t)nextAvail_ >= table_.size()) {
      return makeError(OverflowError,
                       fmt::format("no free slot in table for proc {} parent {} table size {} next avail {}",
                                   pid_,
                                   ppid_,
                                   table_.size(),
                                   nextAvail_));
    }
    // front of the min-heap is the smallest released index
    auto idx = useFree ? free_.front() : nextAvail_++;
    if (useFree) {
      std::ranges::pop_heap(free_, std::greater<int>());
      free_.pop_back();
    }
    auto p = table_[idx].load();
    XLOGF(DBG,
          "idx {} use free {} next avail {} free size {} table idx {}",
          idx,
          useFree,
          nextAvail_,
          free_.size(),
          (void *)p.get());
    assert(!p);
    table_[idx] = v;
    return idx;
  }

  // Empties slot `idx` and makes the index available to add() again.
  // `idx` is not range-checked, and the index is pushed to the free list even if the slot was already empty.
  void resetAt(int idx) {
    std::unique_lock lock(mtx_);
    table_[idx] = ItemPtr{};
    free_.push_back(idx);
    std::ranges::push_heap(free_, std::greater<int>());
    XLOGF(DBG, "reset idx {} next avail {} free size {} first free {}", idx, nextAvail_, free_.size(), free_.front());
  }

  // Number of slots handed out and not yet released.
  size_t size() const {
    std::shared_lock lock(mtx_);
    return (size_t)nextAvail_ - free_.size();
  }

  // Indices of all non-empty slots.
  // NOTE(review): the scan starts at 1, so slot 0 is never reported -- confirm this is intentional.
  std::vector<int> allUsed() const {
    std::vector<int> used;
    used.reserve(table_.size());
    std::shared_lock lock(mtx_);
    for (int i = 1; i < (int)table_.size(); ++i) {
      if (table_[i].load()) {
        used.push_back(i);
      }
    }
    return used;
  }

  // Takes the table's mutex exclusively and hands the lock to the caller.
  std::unique_lock<std::shared_mutex> lock() const { return std::unique_lock<std::shared_mutex>(mtx_); }

 private:
  int pid_;
  int ppid_;
  mutable std::shared_mutex mtx_;
  using AtomicItemPtr = folly::atomic_shared_ptr<Item>;
  std::vector<AtomicItemPtr> table_;
  int nextAvail_ = 0;      // first index that has never been handed out
  std::vector<int> free_;  // released indices below nextAvail_, kept as a min-heap
};
// Map from process id to that process's PerProcTable, guarded by a shared mutex.
template <typename T, uint32_t KeyError, uint32_t OverflowError>
class AllProcMap {
 public:
  using Table = PerProcTable<T, KeyError, OverflowError>;
  using TablePtr = std::shared_ptr<Table>;
  using ItemPtr = typename Table::ItemPtr;

  // Returns the table of process `pid` whose parent is `ppid`, creating it with capacity `cap` if needed.
  // - A known pid with the same ppid returns the existing table.
  // - A known pid with a different ppid is treated as a new process reusing the pid: a fresh, empty table
  //   replaces the old one.
  // - An unknown pid whose parent has a table starts from a copy of the parent's table.
  TablePtr procTable(int pid, int ppid, size_t cap) {
    TablePtr parentTable;
    {
      std::shared_lock lock(mtx_);
      auto it = map_.find(pid);
      if (it != map_.end()) {
        if (ppid == it->second->ppid()) {
          return it->second;
        }  // else, same pid but diff proc, old proc must have died
      } else {
        auto parentIt = map_.find(ppid);
        if (parentIt != map_.end()) {
          parentTable = parentIt->second;
        }
      }
    }

    // built outside the map lock: copying the parent's table may take a while
    TablePtr newTable;
    if (parentTable) {
      newTable = std::make_shared<Table>(*parentTable, pid, ppid, cap);
    } else {
      newTable = std::make_shared<Table>(pid, ppid, cap);
    }

    std::unique_lock lock(mtx_);
    auto &slot = map_[pid];
    if (slot && slot->ppid() == ppid) {
      // another thread registered this very process between our lookup and now; keep its table so that
      // everybody shares one instance instead of silently replacing a table that is already in use
      return slot;
    }
    slot = std::move(newTable);
    return slot;
  }

  // Forgets the table of `pid`; tables still referenced elsewhere stay alive through their shared_ptr.
  void removeProc(int pid) {
    std::unique_lock lock(mtx_);
    map_.erase(pid);
  }

  // Lists (pid, ppid) for every registered process except pid 0.
  std::vector<std::pair<int, int>> allProcs() const {
    std::vector<std::pair<int, int>> procs;
    std::shared_lock lock(mtx_);
    procs.reserve(map_.size());
    for (auto &&p : map_) {
      if (p.first != 0) {
        procs.emplace_back(std::make_pair(p.first, p.second->ppid()));
      }
    }
    return procs;
  }

 private:
  mutable std::shared_mutex mtx_;
  robin_hood::unordered_map<int, TablePtr> map_;
};
} // namespace hf3fs::lib

176
src/lib/common/Shm.cc Normal file
View File

@@ -0,0 +1,176 @@
#include "Shm.h"
#include <fcntl.h>
#include <folly/experimental/coro/BlockingWait.h>
#include <numa.h>
#include <sys/mman.h>
#include <unistd.h>
namespace hf3fs::lib {
ShmBuf::ShmBuf(const Path &p, size_t sz, size_t bsz, int numa, meta::Uid u, int pr, int pp)
: path(p),
size(sz),
blockSize(bsz ? bsz : sz),
off(0),
user(u),
pid(pr),
ppid(pp),
id(Uuid::random()),
owner_(true),
numaNode_(numa),
memhs_((size + blockSize - 1) / blockSize) {
if (bsz < 0 || bsz > sz) {
throw std::invalid_argument(fmt::format("invalid block size {} total bytes {}", bsz, sz));
}
mapBuf();
if (numaNode_ >= 0) {
numa_tonode_memory(bufStart, size, numaNode_);
}
}
ShmBuf::ShmBuf(const Path &p, off_t o, size_t sz, size_t bsz, Uuid u)
: path(p),
size(sz),
blockSize(bsz ? bsz : sz),
off(o),
// user(0),
id(u),
owner_(false),
numaNode_(-1),
memhs_((size + blockSize - 1) / blockSize) {
if (bsz < 0 || bsz > sz) {
throw std::invalid_argument(fmt::format("invalid block size {} total bytes {}", bsz, sz));
}
XLOGF(DBG, "this {} buf start {} off {}", (void *)this, (void *)bufStart, off);
mapBuf();
XLOGF(DBG, "this {} buf start {} off {}", (void *)this, (void *)bufStart, off);
}
// Tears the buffer down: blocks the calling thread until deregisterForIO() has completed, then unmaps the
// memory. unmapBuf() also unlinks the shm name when this object owns it.
ShmBuf::~ShmBuf() {
XLOGF(DBG, "calling dtor of shm {}", (void *)this);
folly::coro::blockingWait(deregisterForIO());
// if (owner_) {
// auto res = ftruncate(fd_, 0);
// XLOGF_IF(ERR, res < 0, "ftruncate shm file to 0 failed {}", res);
// }
unmapBuf();
}
// Drops every IOBuffer handle held for this buffer.
// Does nothing when no registration has been started (regging_ is false). If the first segment has no handle
// yet, waits on memhBaton_ -- which the registration callable posts once it has gone through all segments --
// before clearing, then resets regging_.
CoTask<void> ShmBuf::deregisterForIO() {
XLOGF(DBG, "regging {} memhs {}", regging_.load(), memhs_.size());
if (!regging_) {
co_return;
}
if (!memhs_[0].load()) {
co_await memhBaton_;
}
regging_ = false;
for (auto &memh : memhs_) {
memh.store(std::shared_ptr<storage::client::IOBuffer>());
}
co_return;
}
// Registers every block-sized segment of the mapping with the storage client and stores the resulting
// handles in memhs_.
// - exec: when set, the registration callable is queued on it; otherwise it runs inline before returning
// - sc: storage client whose registerIOBuffer() is called once per segment
// - recordMetrics: invoked once for every segment that registered successfully
// If a previous registration is marked in progress, waits for its baton first. A segment that fails to
// register is logged and its slot stays empty; memhBaton_ is posted after all segments have been attempted.
// The callable captures `this` and `sc` by reference, so both must stay alive until it has run.
CoTask<void> ShmBuf::registerForIO(folly::Executor::KeepAlive<> exec,
storage::client::StorageClient &sc,
std::function<void()> &&recordMetrics) {
if (regging_) {
co_await memhBaton_;
}
regging_ = true;
memhBaton_.reset();
// start from a clean slate so memh() waits for the new handles instead of returning stale ones
for (auto &memh : memhs_) {
memh.store(std::shared_ptr<storage::client::IOBuffer>());
}
auto f = [this, &sc, recordMetrics = std::move(recordMetrics)]() {
for (size_t i = 0; i < memhs_.size(); ++i) {
// the last segment may be shorter than blockSize
auto res = sc.registerIOBuffer(bufStart + blockSize * i, std::min(size - blockSize * i, blockSize));
if (!res) {
XLOGF(ERR,
"failed to register buffer @{} seg #{} with bytes {} block size {} code {} msg {}",
(void *)bufStart,
i,
size,
blockSize,
res.error().code(),
res.error().message());
} else {
memhs_[i].store(std::make_shared<storage::client::IOBuffer>(std::move(*res)));
recordMetrics();
}
}
memhBaton_.post();
};
if (exec) {
exec.get()->add(std::move(f));
} else {
f();
}
}
// Releases the mapping, if there is one; safe to call repeatedly.
// The owning side additionally removes the shm name.
void ShmBuf::unmapBuf() {
  if (!bufStart) {
    return;  // never mapped, or already unmapped
  }

  munmap(bufStart, size);
  if (owner_) {
    shm_unlink(path.c_str());
  }
  bufStart = nullptr;
}
// Returns the IOBuffer handle of the segment containing byte offset `off` (may be null).
// While a registration is in progress and that segment has no handle yet, waits for the registration baton first.
// `off` is not range-checked against the buffer size.
CoTask<std::shared_ptr<storage::client::IOBuffer>> ShmBuf::memh(size_t off) {
  auto &slot = memhs_[off / blockSize];
  if (regging_ && !slot.load()) {
    co_await memhBaton_;
  }
  co_return slot.load();
}
void ShmBuf::mapBuf() {
auto fd = shm_open(path.c_str(), O_RDWR | (owner_ ? O_CREAT | O_EXCL : 0), 0666);
if (fd < 0) {
auto err = errno;
throw std::runtime_error(fmt::format("failed to shm_open {} is owner {} errno {} euid {} uid {}",
path.native(),
owner_,
err,
geteuid(),
getuid()));
}
SCOPE_EXIT { close(fd); };
if (owner_) {
auto res = ftruncate(fd, size);
if (res < 0) {
throw std::runtime_error("failed to ftruncate");
}
}
bufStart = (uint8_t *)mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, off);
if (bufStart == MAP_FAILED) {
throw std::runtime_error("failed to mmap");
}
XLOGF(DBG, "this {} owner {} buf start {} size {} off {}", (void *)this, owner_, (void *)bufStart, size, off);
}
} // namespace hf3fs::lib

102
src/lib/common/Shm.h Normal file
View File

@@ -0,0 +1,102 @@
#pragma once
#include <forward_list>
#include <shared_mutex>
#include "PerProcTable.h"
#include "client/storage/StorageClient.h"
#include "common/utils/Path.h"
#include "fbs/meta/Schema.h"
namespace hf3fs::lib {
// Optional attributes attached to a ShmBuf through its `iora` member.
// Defaults: priority 1, zero timeout, no flags.
struct IorAttrs {
int priority = 1;
Duration timeout{Duration::zero()};
uint64_t flags = 0;
};
// A POSIX shared-memory buffer mapped into this process, split into blockSize-sized segments that can each be
// registered with the storage client for IO.
struct ShmBuf {
  // Owning form: creates, sizes and maps the shm object at `p`.
  ShmBuf(const Path &p, size_t sz, size_t bsz, int numa, meta::Uid u, int pid, int ppid);
  // Non-owning form: maps `sz` bytes at offset `o` of an existing shm object.
  ShmBuf(const Path &p, off_t o, size_t sz, size_t bsz, Uuid u);
  ~ShmBuf();

  CoTask<void> registerForIO(folly::Executor::KeepAlive<> exec,
                             storage::client::StorageClient &sc,
                             std::function<void()> &&recordMetrics);
  // Handle of the segment containing byte offset `off`.
  CoTask<std::shared_ptr<storage::client::IOBuffer>> memh(size_t off);
  bool checkId(const Uuid &uid) const { return id == uid; }
  CoTask<void> deregisterForIO();
  void unmapBuf();
  // Removes the shm name if this object owns it; returns whether it is the owner.
  bool maybeUnlinkShm() {
    if (owner_) {
      shm_unlink(path.c_str());
    }
    return owner_;
  }

  Path path;
  uint8_t *bufStart{nullptr};
  size_t size{0};
  size_t blockSize{0};

  // for client lib
  off_t off;

  // for global shm to do acl
  meta::Uid user{0};

  // for global shm to be freed after owning process is gone
  int pid{0};
  int ppid{0};

  Uuid id;

  // for fuse
  std::string key;
  int iorIndex = -1;
  bool isIoRing = false;
  bool forRead = true;
  int ioDepth = 0;
  std::optional<IorAttrs> iora;

 private:
  void mapBuf();

 private:
  bool owner_;
  int numaNode_;
  // int fd_;

  // for client agent
  std::vector<folly::atomic_shared_ptr<storage::client::IOBuffer>> memhs_;
  folly::coro::Baton memhBaton_;
  // No constructor sets this flag, yet the destructor path (deregisterForIO) reads it. Initialise it here so
  // the "no registration in progress" state does not depend on how a default-constructed atomic behaves.
  std::atomic<bool> regging_{false};
};
// A position inside a shared ShmBuf, used as the memory side of a single IO.
class ShmBufForIO {
 public:
  ShmBufForIO(std::shared_ptr<ShmBuf> buf, off_t off)
      : buf_(std::move(buf)),
        off_(off) {}

  // Address of the position inside the mapping.
  uint8_t *ptr() const {
    auto *addr = buf_->bufStart + off_;
    XLOGF(DBG, "buf start {} off {} ptr {}", (void *)buf_->bufStart, off_, (void *)addr);
    return addr;
  }

  // Registered IO buffer covering `len` bytes from the position.
  // Fails with kInvalidArg when a non-empty range would cross a block boundary.
  CoTryTask<storage::client::IOBuffer *> memh(size_t len) const {
    XLOGF(DBG, "shm buf for io off {} buf ptr {}", off_, (void *)buf_.get());
    XLOGF(DBG, "shm block size {}", buf_->blockSize);
    if (len) {
      const auto firstBlock = off_ / buf_->blockSize;
      const auto lastBlock = (off_ + len - 1) / buf_->blockSize;
      if (firstBlock != lastBlock) {
        co_return makeError(StatusCode::kInvalidArg);
      }
    }
    auto handle = co_await buf_->memh(off_);
    co_return handle.get();
  }

 private:
  std::shared_ptr<ShmBuf> buf_;
  off_t off_;
};
// Per-process tables of ShmBuf: bad lookups are reported as kInvalidArg, a full table as kNotEnoughMemory.
using ProcShmBuf = AllProcMap<ShmBuf, StatusCode::kInvalidArg, StatusCode::kNotEnoughMemory>;
} // namespace hf3fs::lib

5
src/lib/common/paths.h Normal file
View File

@@ -0,0 +1,5 @@
#pragma once
namespace hf3fs::lib::agent {
// Path constant shared by every translation unit including this header.
// `inline constexpr` gives one read-only definition program-wide, where `static` produced a separate,
// reassignable pointer in each translation unit.
inline constexpr const char *varTmpPath = "/var/tmp/hf3fs_client_agent";
}  // namespace hf3fs::lib::agent

14
src/lib/py/CMakeLists.txt Normal file
View File

@@ -0,0 +1,14 @@
pybind11_add_module(hf3fs_py_usrbio usrbio_binding.cc)
target_compile_definitions(hf3fs_py_usrbio
PRIVATE) # VERSION_INFO=${PYCLIENT_VERSION_INFO})
target_include_directories(hf3fs_py_usrbio
PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/src>
${PROJECT_SOURCE_DIR}
${PROJECT_BINARY_DIR}/src
${PROJECT_BINARY_DIR}
)
target_link_libraries(hf3fs_py_usrbio PRIVATE hf3fs_api_shared)

496
src/lib/py/binding.cc Normal file
View File

@@ -0,0 +1,496 @@
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <stdexcept>
#include "lib/api/Client.h"
namespace hl = hf3fs::lib;
namespace py = pybind11;
using py_iovec = std::tuple<hl::iovec, int, off_t>;
// Thrown by unwrap() when a client call failed. The module's exception translator stores errcode into errno
// and raises a Python OSError carrying filename.
struct OSException {
int errcode;
std::string filename;
};
// Converts a client Result into its value, or throws OSException built from the error's (first, second) pair.
// Results carrying hl::Empty map to a void return.
template <typename T>
static std::conditional_t<std::is_same_v<T, hl::Empty>, void, T> unwrap(hl::Result<T> r) {
  if (!r) {
    throw OSException{r.error().first, r.error().second};
  }
  if constexpr (!std::is_same_v<T, hl::Empty>) {
    return r.value();
  }
}
// Flattens a timespec into a single count of nanoseconds.
static int64_t makeTimeNs(const timespec &t) {
  constexpr long long kNanosPerSecond = 1'000'000'000ll;
  return t.tv_sec * kNanosPerSecond + t.tv_nsec;
}
PYBIND11_MODULE(hf3fs_py_usrbio, m) {
py::register_exception_translator([](std::exception_ptr p) {
try {
if (p) std::rethrow_exception(p);
} catch (const OSException &e) {
errno = e.errcode;
PyErr_SetFromErrnoWithFilename(PyExc_OSError, e.filename.c_str());
}
});
m.attr("HF3FS_SUPER_MAGIC") = py::int_(uint32_t(HF3FS_SUPER_MAGIC));
// m.attr("HF3FS_IOC_GET_MOUNT_NAME") = py::int_(uint32_t(hl::HF3FS_IOC_GET_MOUNT_NAME));
// m.attr("HF3FS_IOC_GET_PATH_OFFSET") = py::int_(uint32_t(hl::HF3FS_IOC_GET_PATH_OFFSET));
// m.attr("HF3FS_IOC_GET_MAGIC_NUM") = py::int_(uint32_t(hl::HF3FS_IOC_GET_MAGIC_NUM));
// m.def("init", &hl::initLib);
py::class_<hl::DIR, std::shared_ptr<hl::DIR>>(m, "_DIR");
py::class_<hl::dirent>(m, "dirent")
.def_readonly("d_type", &hl::dirent::d_type)
.def_readonly("d_name", &hl::dirent::d_name);
py::class_<hl::stat>(m, "stat_result")
.def_readonly("st_mode", &hl::stat::st_mode)
.def_readonly("st_nlink", &hl::stat::st_nlink)
.def_readonly("st_uid", &hl::stat::st_uid)
.def_readonly("st_gid", &hl::stat::st_gid)
.def_readonly("st_size", &hl::stat::st_size)
.def_property_readonly("st_atime", [](const hl::stat &st) { return st.st_atim.tv_sec; })
.def_property_readonly("st_atime_ns", [](const hl::stat &st) { return makeTimeNs(st.st_atim); })
.def_property_readonly("st_mtime", [](const hl::stat &st) { return st.st_mtim.tv_sec; })
.def_property_readonly("st_mtime_ns", [](const hl::stat &st) { return makeTimeNs(st.st_mtim); })
.def_property_readonly("st_ctime", [](const hl::stat &st) { return st.st_ctim.tv_sec; })
.def_property_readonly("st_ctime_ns", [](const hl::stat &st) { return makeTimeNs(st.st_ctim); })
.def_readonly("st_ltarg", &hl::stat::st_ltarg)
.def("is_file", [](const hl::stat &st) { return (st.st_mode & S_IFMT) == S_IFREG; })
.def("is_dir", [](const hl::stat &st) { return (st.st_mode & S_IFMT) == S_IFDIR; })
.def("is_link", [](const hl::stat &st) { return (st.st_mode & S_IFMT) == S_IFLNK; });
py::class_<hl::iovec>(m, "iovec", py::buffer_protocol(), R"(
buffer protocol
iovalloc
)")
.def_buffer([](hl::iovec &iov) -> py::buffer_info {
return py::buffer_info(iov.iov_base, 1, py::format_descriptor<uint8_t>::format(), 1, {iov.iov_len}, {1});
})
.def(py::init([](const hl::iovec &other, std::optional<py::buffer> buf) {
       // Builds a sub-iovec of `other` that covers exactly the memory of the Python buffer `buf`.
       // `buf` is an optional, so Python callers can pass None: reject that instead of dereferencing
       // an empty optional.
       if (!buf) {
         throw std::invalid_argument("buf must not be None");
       }
       auto bufInfo = buf->request(true);  // writable access is required
       auto ptr = (uint8_t *)bufInfo.ptr;
       auto bs = bufInfo.itemsize * (uint64_t)bufInfo.size;
       // the buffer must lie entirely inside [iov_base, iov_base + iov_len)
       if (ptr < other.iov_base) {
         throw std::out_of_range("given buf is out of range");
       }
       size_t off = (uint8_t *)ptr - (uint8_t *)other.iov_base;
       size_t bytes = bs;
       if (off + bytes > other.iov_len) {
         throw std::out_of_range("given buf is out of range");
       }
       // shares the handle of `other`; user_data is not inherited
       return hl::iovec{(uint8_t *)other.iov_base + off, bytes, other.iov_handle, nullptr};
     }),
     py::arg("other"),
     py::arg("buf"),
     R"(
Python buffer iovec
Args:
other: iovec
buf: Python buffer buffer iovec
)")
.def(py::init([](const hl::iovec &other, std::optional<size_t> off, std::optional<size_t> bytes) {
size_t off_value = off.value_or(0);
size_t bytes_value = bytes.value_or(other.iov_len);
if (off_value + bytes_value > other.iov_len) {
throw std::out_of_range("given off and bytes is out of range");
}
return hl::iovec{(uint8_t *)other.iov_base + off_value, bytes_value, other.iov_handle, nullptr};
}),
py::arg("other"),
py::arg("off") = std::nullopt,
py::arg("bytes") = std::nullopt,
R"(
iovec iovec
Args:
other: iovec
off: iovec iovec byte
bytes: iovec byte
)");
#define PY_METHOD_AT(method, ...) return unwrap(self.method##at(dir_fd.value_or(AT_FDCWD), __VA_ARGS__));
#define PY_F_METHOD_AT(method, ...) \
unwrap(self.method##at(dir_fd.value_or(AT_FDCWD), __VA_ARGS__, follow_symlinks ? 0 : AT_SYMLINK_NOFOLLOW))
py::class_<hl::IClient, std::shared_ptr<hl::IClient>>(m, "Client")
.def(py::init([](std::string_view mountName, std::string_view token, bool as_super) {
if (as_super) {
return unwrap(hl::IClient::newSuperClient(mountName, token));
} else {
return unwrap(hl::IClient::newClient(mountName, token));
}
}),
py::arg("mount_name"),
py::arg("token"),
py::kw_only(),
py::arg("as_super") = false,
R"(
client
Args:
mount_name:
token: 访 3FS
as_super: super client token root
)")
.def(
"opendir",
[](hl::IClient &self, std::string name, std::optional<int> dir_fd) { PY_METHOD_AT(opendir, name); },
py::arg("name"),
py::kw_only(),
py::arg("dir_fd") = std::nullopt)
.def("readdir", [](hl::IClient &self, const std::shared_ptr<hl::DIR> &dp) { return unwrap(self.readdir(dp)); })
.def("rewinddir",
[](hl::IClient &self, const std::shared_ptr<hl::DIR> &dp) { return unwrap(self.rewinddir(dp)); })
.def(
"mkdir",
[](hl::IClient &self, std::string path, mode_t mode, std::optional<int> dir_fd, bool recursive) {
PY_METHOD_AT(mkdir, path, mode, recursive);
},
py::arg("path"),
py::arg("mode") = 0777,
py::kw_only(),
py::arg("dir_fd") = std::nullopt,
py::arg("recursive") = false)
.def(
"rmdir",
[](hl::IClient &self, std::string path, std::optional<int> dir_fd, bool recursive) {
PY_METHOD_AT(rmdir, path, recursive);
},
py::arg("path"),
py::kw_only(),
py::arg("dir_fd") = std::nullopt,
py::arg("recursive") = false)
.def(
"unlink",
[](hl::IClient &self, std::string path, std::optional<int> dir_fd) { PY_METHOD_AT(unlink, path, 0); },
py::arg("path"),
py::kw_only(),
py::arg("dir_fd") = std::nullopt)
.def(
    "remove",
    [](hl::IClient &self, std::string path, std::optional<int> dir_fd, bool recursive) {
      // removeat() takes (dirfd, pathname, isDir, recursive). isDir has to be spelled out, otherwise
      // `recursive` is bound to isDir and the real recursive flag silently stays false.
      PY_METHOD_AT(remove, path, std::nullopt, recursive);
    },
    py::arg("path"),
    py::kw_only(),
    py::arg("dir_fd") = std::nullopt,
    py::arg("recursive") = false)
.def(
"rename",
[](hl::IClient &self,
std::string src,
std::string dst,
std::optional<int> src_dir_fd,
std::optional<int> dst_dir_fd) {
unwrap(self.renameat(src_dir_fd.value_or(AT_FDCWD), src, dst_dir_fd.value_or(AT_FDCWD), dst));
},
py::arg("src"),
py::arg("dst"),
py::kw_only(),
py::arg("src_dir_fd") = std::nullopt,
py::arg("dst_dir_fd") = std::nullopt)
.def(
"creat",
[](hl::IClient &self, std::string path, mode_t mode, std::optional<int> dir_fd, bool excl) {
PY_METHOD_AT(creat, path, mode, excl);
},
py::arg("path"),
py::arg("mode") = 0666,
py::kw_only(),
py::arg("dir_fd") = std::nullopt,
py::arg("excl") = false)
.def(
"symlink",
[](hl::IClient &self, std::string src, std::string dst, std::optional<int> dir_fd) {
unwrap(self.symlinkat(src, dir_fd.value_or(AT_FDCWD), dst));
},
py::arg("src"),
py::arg("dst"),
py::kw_only(),
py::arg("dir_fd") = std::nullopt)
// Hard link `dst` -> `src`, each path optionally relative to its own directory fd.
// Note that the Python keyword names are old_dir_fd / new_dir_fd, not the lambda's parameter names.
.def(
"link",
[](hl::IClient &self,
std::string src,
std::string dst,
std::optional<int> src_dir_fd,
std::optional<int> dst_dir_fd,
bool follow_symlinks) {
unwrap(self.linkat(src_dir_fd.value_or(AT_FDCWD),
src,
dst_dir_fd.value_or(AT_FDCWD),
dst,
// NOTE(review): follow_symlinks=true selects AT_SYMLINK_NOFOLLOW here, the opposite of
// PY_F_METHOD_AT (which passes AT_SYMLINK_NOFOLLOW when follow_symlinks is false); POSIX linkat()
// uses AT_SYMLINK_FOLLOW to request following. Looks inverted -- confirm against IClient::linkat.
follow_symlinks ? AT_SYMLINK_NOFOLLOW : 0));
},
py::arg("src"),
py::arg("dst"),
py::kw_only(),
py::arg("old_dir_fd") = std::nullopt,
py::arg("new_dir_fd") = std::nullopt,
py::arg("follow_symlinks") = true)
.def(
"stat",
[](hl::IClient &self, std::string path, std::optional<int> dir_fd, bool follow_symlinks) {
hl::stat st;
PY_F_METHOD_AT(fstat, path, &st);
return st;
},
py::arg("path"),
py::kw_only(),
py::arg("dir_fd") = std::nullopt,
py::arg("follow_symlinks") = true)
.def("fstat",
[](hl::IClient &self, int fd) {
hl::stat st;
unwrap(self.fstat(fd, &st));
return st;
})
.def(
"readlink",
[](hl::IClient &self, std::string path, std::optional<int> dir_fd) { PY_METHOD_AT(readlink, path); },
py::arg("path"),
py::kw_only(),
py::arg("dir_fd") = std::nullopt)
.def(
"realpath",
[](hl::IClient &self, std::string path, std::optional<int> dir_fd, bool absolute) {
PY_METHOD_AT(realpath, path, absolute);
},
py::arg("path"),
py::kw_only(),
py::arg("dir_fd") = std::nullopt,
py::arg("absolute") = false)
.def(
"open",
[](hl::IClient &self, std::string path, int flags, mode_t mode, std::optional<int> dir_fd) {
PY_METHOD_AT(open, path, flags, mode);
},
py::arg("path"),
py::arg("flags"),
py::arg("mode") = 0666,
py::kw_only(),
py::arg("dir_fd") = std::nullopt,
R"(
使 client 3fs fd fd
使 hf3fs.fuse.serverPath
)")
.def("close", [](hl::IClient &self, int fd) { return unwrap(self.close(fd)); })
.def("ftruncate", [](hl::IClient &self, int fd, off_t length) { return unwrap(self.ftruncate(fd, length)); })
.def(
    "utime",
    // Sets access/modification time of `path`.
    // - times: (atime, mtime) in seconds, may carry a fractional part
    // - ns: (atime, mtime) in nanoseconds
    // At most one of them may be given (std::invalid_argument otherwise); with neither, both stamps are set
    // to the current time. `times` is taken as double: a float cannot represent present-day epoch seconds
    // exactly, let alone a fractional part.
    [](hl::IClient &self,
       std::string path,
       std::optional<std::pair<double, double>> times,
       std::optional<std::pair<int64_t, int64_t>> ns,
       std::optional<int> dir_fd,
       bool follow_symlinks) {
      // split seconds into whole seconds plus a nanosecond remainder kept inside [0, 1e9)
      auto fromSeconds = [](double secs) {
        struct timespec t = {};
        t.tv_sec = (time_t)secs;
        t.tv_nsec = (long)((secs - (double)t.tv_sec) * 1e9);
        if (t.tv_nsec < 0) {  // negative input truncates towards zero, borrow one second
          --t.tv_sec;
          t.tv_nsec += 1'000'000'000;
        }
        if (t.tv_nsec > 999'999'999) {  // guard against rounding up to a full second
          t.tv_nsec = 999'999'999;
        }
        return t;
      };

      struct timespec ts[2] = {};
      if (times.has_value() && ns.has_value()) {
        throw std::invalid_argument("times and ns are both specified");
      } else if (times.has_value()) {
        ts[0] = fromSeconds(times->first);
        ts[1] = fromSeconds(times->second);
      } else if (ns.has_value()) {
        ts[0].tv_sec = ns->first / 1'000'000'000;
        ts[0].tv_nsec = ns->first % 1'000'000'000;
        ts[1].tv_sec = ns->second / 1'000'000'000;
        ts[1].tv_nsec = ns->second % 1'000'000'000;
      } else {
        ts[0].tv_nsec = UTIME_NOW;
        ts[1].tv_nsec = UTIME_NOW;
      }
      PY_F_METHOD_AT(utimens, path, ts);
    },
    py::arg("path"),
    py::kw_only(),
    py::arg("times") = std::nullopt,
    py::arg("ns") = std::nullopt,
    py::arg("dir_fd") = std::nullopt,
    py::arg("follow_symlinks") = true)
.def(
"chmod",
[](hl::IClient &self, std::string path, mode_t mode, std::optional<int> dir_fd, bool follow_symlinks) {
PY_F_METHOD_AT(fchmod, path, mode);
},
py::arg("path"),
py::arg("mode"),
py::kw_only(),
py::arg("dir_fd") = std::nullopt,
py::arg("follow_symlinks") = true)
.def(
"chown",
[](hl::IClient &self, std::string path, int uid, int gid, std::optional<int> dir_fd, bool follow_symlinks) {
PY_F_METHOD_AT(fchown, path, uid, gid);
},
py::arg("path"),
py::arg("uid"),
py::arg("gid"),
py::kw_only(),
py::arg("dir_fd") = std::nullopt,
py::arg("follow_symlinks") = true)
.def("chdir", [](hl::IClient &self, std::string path) { unwrap(self.chdir(path)); })
.def(
"iovalloc",
[](hl::IClient &self, size_t bytes, int numa, bool global, size_t block_size) {
return unwrap(self.iovalloc(bytes, numa, global, block_size));
},
py::arg("bytes"),
py::kw_only(),
py::arg("numa") = -1,
py::arg("global") = false,
py::arg("block_size") = 0,
R"(
iovec numa
Args:
bytes: iovec byte
numa: iovec numa numa
global: 访iovecshare给其它进程false
Examples:
.. code-block:: python
client.iovalloc(1 << 30)
client.iovalloc(1 << 30, numa=1)
)")
.def("iovfree", [](hl::IClient &self, hl::iovec iov) { unwrap(self.iovfree(iov.iov_handle)); })
.def(
"preadv",
[](hl::IClient &self, const std::vector<py_iovec> &piov) {
std::vector<hl::iovec> iov(piov.size());
std::vector<hl::ioseg> segv(piov.size());
std::vector<ssize_t> resv(piov.size());
for (size_t i = 0; i < piov.size(); ++i) {
iov[i] = std::get<0>(piov[i]);
segv[i] = hl::ioseg{std::get<1>(piov[i]), std::get<2>(piov[i])};
}
unwrap(self.preadv((int)piov.size(), iov.data(), segv.data(), resv.data()));
return resv;
},
R"(
iovec
Args:
piov: (iovec, fd, offset)
.. code-block:: python
import hf3fs as h3
iov = client.iovalloc(1 << 30)
client.preadv([(h3.iovec(iov, 3072, 2048), 3, 1024)])
# 从 3 号 fd 中的文件,从 1024 字节开始,读取 2048 字节到 iov 上从 3072 字节开始的内存
)")
.def(
"pwritev",
[](hl::IClient &self, const std::vector<py_iovec> &piov) {
std::vector<hl::iovec> iov(piov.size());
std::vector<hl::ioseg> segv(piov.size());
std::vector<ssize_t> resv(piov.size());
for (size_t i = 0; i < piov.size(); ++i) {
iov[i] = std::get<0>(piov[i]);
segv[i] = hl::ioseg{std::get<1>(piov[i]), std::get<2>(piov[i])};
}
unwrap(self.pwritev((int)piov.size(), iov.data(), segv.data(), resv.data()));
return resv;
},
R"(
iovec
Args:
piov: (iovec, fd, offset)
.. code-block:: python
import hf3fs as h3
iov = client.iovalloc(1 << 30)
memoryview(h3.iovec(iov, 3072, 2048))[:] = bytes([1] * 2048)
client.pwrite([(h3.iovec(iov, 3072, 2048), 3, 1024)])
# 将从 iov 中3072 字节开始的连续 2048 个字节写入到 3 号 fd 中的文件 offset 为 1024 的位置
)")
.def(
"lseek",
[](hl::IClient &self, int fd, off_t pos, int how, std::optional<size_t> readahead) {
return unwrap(self.lseek(fd, pos, how, readahead.value_or(0)));
},
py::arg("fd"),
py::arg("pos"),
py::arg("how"),
py::kw_only(),
py::arg("readahead") = std::nullopt)
.def(
"read",
[](hl::IClient &self, int fd, py::buffer buf, std::optional<size_t> readahead) {
auto binfo = buf.request();
ssize_t stride = binfo.itemsize;
for (size_t i = binfo.ndim; i--;) {
if (stride != binfo.strides[i]) {
throw std::invalid_argument("cannot read into incontiguous buffer");
} else {
stride *= binfo.shape[i];
}
}
return unwrap(self.read(fd, binfo.ptr, stride, readahead.value_or(0)));
},
py::arg("fd"),
py::arg("buf"),
py::kw_only(),
py::arg("readahead") = std::nullopt)
.def(
"write",
[](hl::IClient &self, int fd, py::buffer buf, bool flush) {
auto binfo = buf.request();
ssize_t stride = binfo.itemsize;
for (size_t i = binfo.ndim; i--;) {
if (stride != binfo.strides[i]) {
throw std::invalid_argument("cannot write from incontiguous buffer");
} else {
stride *= binfo.shape[i];
}
}
return unwrap(self.write(fd, binfo.ptr, stride, flush));
},
py::arg("fd"),
py::arg("buf"),
py::kw_only(),
py::arg("flush") = false)
.def(
"sharedFileHandles",
[](hl::IClient &self, const std::vector<int> &fds) { return unwrap(self.sharedFileHandles(fds)); },
py::arg("fds"))
.def(
"openWithFileHandles",
[](hl::IClient &self, const std::vector<uint8_t> &fhs) { return unwrap(self.openWithFileHandles(fhs)); },
py::arg("fhs"))
.def(
"sharedIovecHandle",
[](hl::IClient &self, const hl::iovec &iov) { return unwrap(self.sharedIovecHandle(iov.iov_handle)); },
py::arg("iov"))
.def(
"openIovecHandle",
[](hl::IClient &self, const std::string &iovh) { return unwrap(self.openIovecHandle(iovh)); },
py::arg("iovh"));
// py::class_<hl::ClientPool>(m, "ClientPool")
// .def(py::init<int, std::string_view, uint32_t>())
// .def("acquire", &ClientPool::acquire)
// .def("release", &ClientPool::release);
#undef PY_F_METHOD_AT
#undef PY_METHOD_AT
}

View File

@@ -0,0 +1,470 @@
#include <chrono>
#include <ctime>
#include <fcntl.h>
#include <filesystem>
#include <fmt/format.h>
#include <pybind11/chrono.h>
#include <pybind11/pybind11.h>
#include <pybind11/pytypes.h>
#include <pybind11/stl.h>
#include <stdexcept>
#include "common/utils/Uuid.h"
#include "lib/api/fuse.h"
#include "lib/api/hf3fs_usrbio.h"
namespace py = pybind11;
// Exception type used by the bindings in this module to report an OS-level
// failure. The translator registered at the top of PYBIND11_MODULE catches it,
// assigns `errcode` to errno and raises a Python OSError through
// PyErr_SetFromErrno.
struct OSException {
// errno value handed to PyErr_SetFromErrno
int errcode;
};
// hf3fs_iov extended with the per-I/O state that the Python `iovec` class
// exposes.
struct Hf3fsIovWithRes : public hf3fs_iov {
// Set by `slice_by` / `__getitem__` to the iovec a slice was cut from (the
// outermost one when slicing a slice); empty for an iovec built by the
// constructor binding.
std::shared_ptr<Hf3fsIovWithRes> base_iov;
// Completion result copied from the cqe by `ioring.wait`.
ssize_t result = 0;
// Python object that was passed to `ioring.prepare`; filled in by
// `ioring.wait` from the cqe's userdata pointer.
py::object userdata;
};
// hf3fs_ior extended with a table of the iovecs that currently have an I/O
// prepared on the ring.
struct Hf3fsIorWithIovs : public hf3fs_ior {
// Resized to hf3fs_io_entries() when the ring is created. `prepare` stores
// the iovec at the index returned by hf3fs_prep_io, and `wait` moves it out
// again using the index reported in the cqe.
std::vector<std::shared_ptr<Hf3fsIovWithRes>> iovs;
};
PYBIND11_MODULE(hf3fs_py_usrbio, m) {
// Map OSException onto Python's OSError: the carried code is published via
// errno and PyErr_SetFromErrno builds the Python exception from it. Any other
// exception type is not caught here and propagates out of the translator.
py::register_exception_translator([](std::exception_ptr eptr) {
  if (!eptr) {
    return;
  }
  try {
    std::rethrow_exception(eptr);
  } catch (const OSException &osErr) {
    errno = osErr.errcode;
    PyErr_SetFromErrno(PyExc_OSError);
  }
});
// Module-level helpers: mount point lookup and fd (de)registration.
m.def(
     "extract_mount_point",
     // Returns None when hf3fs_extract_mount_point reports a negative value,
     // raises OSError(ENAMETOOLONG) when the reported value exceeds the local
     // buffer size, and otherwise returns the buffer contents as a string.
     [](const std::string &path) -> std::optional<std::string> {
       char mountPoint[4096];
       const auto rc = hf3fs_extract_mount_point(mountPoint, sizeof(mountPoint), path.c_str());
       if (rc < 0) {
         return std::nullopt;
       }
       if (rc > (int)sizeof(mountPoint)) {
         throw OSException{ENAMETOOLONG};
       }
       return std::string(mountPoint);
     },
     py::arg("path"),
     R"(
hf3fs的挂载点iov和ior的创建
Args:
path: hf3fs上的文件路径
)")
    .def(
        "register_fd",
        // A positive return value from hf3fs_reg_fd is treated as an errno
        // and raised as OSError; anything else counts as success.
        [](int fd, uint64_t flags) {
          if (const auto rc = hf3fs_reg_fd(fd, flags); rc > 0) {
            throw OSException{rc};
          }
        },
        py::arg("fd"),
        py::arg("flags") = 0,
        R"(
使 usrbio fd
fd
Args:
fd:
)")
    // Bound directly to the C function; no error translation is applied.
    .def("deregister_fd",
         &hf3fs_dereg_fd,
         py::arg("fd"),
         R"(
使 usrbio fd
close
Args:
fd:
)");
// Issues the HF3FS_IOC_FSYNC ioctl on `fd` and hands the raw ioctl return
// value back to Python; failures are not converted into exceptions.
m.def(
    "force_fsync",
    [](int fd) {
      const auto rc = ioctl(fd, hf3fs::lib::fuse::HF3FS_IOC_FSYNC);
      return rc;
    },
    py::arg("fd"),
    R"(
使 stat
Args:
fd:
)");
// Creates `link_name` as a hard link to `target` through hf3fs_hardlink.
// A non-zero return raises OSError built from the current errno.
m.def(
    "hardlink",
    [](const std::string &target, const std::string &link_name) {
      if (hf3fs_hardlink(target.c_str(), link_name.c_str()) != 0) {
        throw OSException{errno};
      }
    },
    py::arg("target"),
    py::arg("link_name"));
py::class_<Hf3fsIovWithRes, std::shared_ptr<struct Hf3fsIovWithRes>>(m,
"iovec",
py::buffer_protocol(),
R"(
usrbio Python SharedMemory buffer protocol numa
)")
.def_buffer([](const Hf3fsIovWithRes &iov) -> py::buffer_info {
return py::buffer_info(iov.base, 1, py::format_descriptor<uint8_t>::format(), 1, {iov.size}, {1});
})
.def(py::init([](py::buffer base,
std::string_view id,
const char *hf3fs_mount_point,
size_t block_size = 0,
int numa = -1) {
auto iov = std::make_shared<struct Hf3fsIovWithRes>();
hf3fs::Uuid uuid;
auto idRes = hf3fs::Uuid::fromHexString(id);
if (!idRes) {
throw std::invalid_argument(fmt::format("id '{}' is not a valid UUID ", *idRes));
}
uuid = *idRes;
auto bufInfo = base.request(true);
auto size = bufInfo.itemsize * (uint64_t)bufInfo.size;
auto ptr = (uint8_t *)bufInfo.ptr;
auto res = hf3fs_iovwrap(iov.get(), ptr, uuid.data, hf3fs_mount_point, size, block_size, numa);
if (res < 0) {
throw OSException{-res};
}
return iov;
}),
py::arg("base"),
py::arg("id"),
py::arg("hf3fs_mount_point"),
py::arg("block_size") = 0,
py::arg("numa") = -1)
.def(
"slice_by",
[](const std::shared_ptr<Hf3fsIovWithRes> &self, py::buffer buf) {
auto bufInfo = buf.request(true);
auto ptr = (uint8_t *)bufInfo.ptr;
auto bs = bufInfo.itemsize * (uint64_t)bufInfo.size;
if (ptr < self->base) {
throw std::out_of_range("given buf is out of range");
}
size_t off = (uint8_t *)ptr - (uint8_t *)self->base;
size_t bytes = bs;
if (off + bytes > self->size) {
throw std::out_of_range("given buf is out of range");
}
auto iov = std::make_shared<struct Hf3fsIovWithRes>();
iov->base = ptr;
iov->size = bs;
iov->block_size = self->block_size;
iov->numa = self->numa;
memcpy(iov->id, self->id, sizeof(self->id));
strcpy(iov->mount_point, self->mount_point);
iov->base_iov = self->base_iov ? self->base_iov : self;
return iov;
},
py::arg("buf"),
R"(
iovec buf
buf buf iovec
Args:
buf: Python buffer
)")
.def("__getitem__",
[](const std::shared_ptr<Hf3fsIovWithRes> &self, const py::slice &slice) {
size_t start = 0, stop = 0, step = 0, slicelength = 0;
if (!slice.compute(self->size, &start, &stop, &step, &slicelength)) {
throw py::error_already_set();
} else if (step != 1) {
throw std::invalid_argument("step not 1 when slicing iovec");
}
auto iov = std::make_shared<struct Hf3fsIovWithRes>();
iov->base = self->base + start;
iov->size = slicelength;
iov->block_size = self->block_size;
iov->numa = self->numa;
memcpy(iov->id, self->id, sizeof(self->id));
strcpy(iov->mount_point, self->mount_point);
iov->base_iov = self->base_iov ? self->base_iov : self;
// XLOGF(DBG,
// "self {} iov {} base {} base off {} size {}",
// (void *)self.get(),
// (void *)iov.get(),
// (void *)iov->base,
// iov->base_off,
// iov->size);
return iov;
})
.def_readonly("result", &Hf3fsIovWithRes::result, R"(
-errno
)")
.def_property_readonly("base_off",
[](const std::shared_ptr<Hf3fsIovWithRes> &self) {
return self->base_iov ? self->base - self->base_iov->base : 0;
})
.def_property_readonly(
"userdata",
[](const std::shared_ptr<Hf3fsIovWithRes> &self) { return self->userdata; },
R"(
userdata prepare
)");
// Python class `ioring`: wraps an hf3fs_ior together with the table of iovecs
// that currently have an I/O prepared on it.
py::class_<Hf3fsIorWithIovs, std::shared_ptr<Hf3fsIorWithIovs>>(m, "ioring", R"(
usrbio fuse
)")
    // Constructor: creates the ring with hf3fs_iorcreate4 and sizes the iovec
    // table to hf3fs_io_entries(). `priority` is accepted but not used; a
    // missing `timeout` is passed on as 0. Raises OSError when creation
    // returns a negative errno.
    .def(py::init([](const char *hf3fs_mount_point,
                     int entries,
                     bool for_read = true,
                     int io_depth = 0,
                     std::optional<int> priority = std::nullopt,
                     std::optional<int> timeout = std::nullopt,
                     int numa = -1,
                     int flags = 0) {
           // The custom deleter tears the ring down before freeing the wrapper.
           std::shared_ptr<Hf3fsIorWithIovs> ior(new Hf3fsIorWithIovs{}, [](auto p) {
             hf3fs_iordestroy(p);
             delete p;
           });
           (void)priority;
           auto res = hf3fs_iorcreate4(ior.get(),
                                       hf3fs_mount_point,
                                       entries,
                                       for_read,
                                       io_depth,
                                       timeout.value_or(0),
                                       numa,
                                       flags);
           if (res < 0) {
             throw OSException{-res};
           }
           ior->iovs.resize(hf3fs_io_entries(ior.get()));
           return ior;
         }),
         py::arg("hf3fs_mount_point"),
         py::arg("entries"),
         py::arg("for_read") = true,
         py::arg("io_depth") = 0,
         py::arg("priority") = std::nullopt,
         py::arg("timeout") = std::nullopt,
         py::arg("numa") = -1,
         py::arg("flags") = 0)
    .def_static("size_for_entries", &hf3fs_ior_size)
    .def_property_readonly("entries",
                           [](const std::shared_ptr<Hf3fsIorWithIovs> &self) { return hf3fs_io_entries(self.get()); })
    // prepare(): queues one I/O for `iov` with hf3fs_prep_io and remembers the
    // iovec at the returned index so that wait() can hand it back. Returns the
    // ring itself so calls can be chained. Raises OSError on a negative result.
    .def(
        "prepare",
        [](const std::shared_ptr<Hf3fsIorWithIovs> &self,
           const std::shared_ptr<Hf3fsIovWithRes> &iov,
           bool read,
           int fd,
           size_t off,
           const py::object &userdata) {
          // The ring only stores a raw PyObject*, so take a strong reference
          // here; wait() adopts it when the completion is fetched.
          userdata.inc_ref();
          py::gil_scoped_release gr;
          auto res = hf3fs_prep_io(self.get(),
                                   (iov->base_iov ? iov->base_iov : iov).get(),
                                   read,
                                   iov->base,
                                   fd,
                                   off,
                                   iov->size,
                                   (void *)userdata.ptr());
          if (res < 0) {
            // Nothing was queued, so no completion will ever release the
            // reference taken above; drop it here (needs the GIL).
            {
              py::gil_scoped_acquire ga;
              userdata.dec_ref();
            }
            throw OSException{-res};
          }
          self->iovs[res] = iov;
          return self;
        },
        py::arg("iov"),
        py::arg("read"),
        py::arg("fd"),
        py::arg("off"),
        py::arg("userdata") = py::none(),
        R"(
ioring
prepare usrbio
Args:
iov: 使 iov slice
read: True / False ioring
fd:
off:
userdata: None Python object object
)")
    // submit(): calls hf3fs_submit_ios with the GIL released; returns the ring.
    .def(
        "submit",
        [](const std::shared_ptr<Hf3fsIorWithIovs> &self) {
          py::gil_scoped_release gr;
          auto res = hf3fs_submit_ios(self.get());
          if (res < 0) {
            throw OSException{-res};
          }
          return self;
        },
        R"(
ioring
)")
    // wait(): collects completions with hf3fs_wait_for_ios and returns the
    // iovecs they belong to, with `result` and `userdata` filled in.
    // Raises OSError when the first wait call fails; if a later call fails,
    // the completions gathered so far are returned instead.
    .def(
        "wait",
        [](const std::shared_ptr<Hf3fsIorWithIovs> &self,
           int max_results = 0,
           int min_results = 0,
           std::optional<std::chrono::microseconds> timeout = std::nullopt) {
          std::vector<std::shared_ptr<Hf3fsIovWithRes>> out;
          // Raw userdata pointers of the completions in `out`, in the same
          // order. They are turned into py::object only after the GIL is held
          // again, because that touches Python reference counts.
          std::vector<PyObject *> userdatas;
          {
            py::gil_scoped_release gr;
            if (max_results < min_results) {
              max_results = min_results;
            }
            struct timespec start, ts;
            struct timespec *tsp = nullptr;
            auto res = clock_gettime(CLOCK_REALTIME, &start);
            if (res < 0) {
              // OSException carries a positive errno, like every other throw
              // site in this module.
              auto err = errno;
              throw OSException{err};
            }
            if (timeout) {
              // Build the deadline as start + timeout, normalising tv_nsec.
              ts = start;
              XLOGF(DBG, "timeout {}us", timeout->count());
              ts.tv_sec += (uint64_t)timeout->count() / 1000000;
              ts.tv_nsec += (uint64_t)timeout->count() % 1000000 * 1000;
              ts.tv_sec += ts.tv_nsec / 1000000000;
              ts.tv_nsec %= 1000000000;
              tsp = &ts;
            }
            std::vector<struct hf3fs_cqe> cqes(max_results > 0 ? max_results : min_results > 0 ? min_results : 1);
            const size_t expected = max_results > 0 ? max_results : min_results > 0 ? min_results : 1024;
            out.reserve(expected);
            userdatas.reserve(expected);
            do {
              XLOGF(DBG, "to wait for ios");
              // Until min_results completions have been collected the caller's
              // deadline (or none) applies; after that `start`, a time already
              // in the past, is passed instead.
              auto res = hf3fs_wait_for_ios(self.get(),
                                            cqes.data(),
                                            cqes.size(),
                                            min_results - (int)out.size(),
                                            (int)out.size() < min_results ? tsp : &start);
              XLOGF(DBG, "waited for ios with res {}", res);
              if (res < 0) {
                if (out.empty()) {
                  throw OSException{-res};
                }
                // Keep the completions already fetched; leave the loop so that
                // their userdata references are still adopted below.
                break;
              }
              for (int i = 0; i < res; ++i) {
                auto idx = cqes[i].index;
                XLOGF(DBG, "iov idx {}", idx);
                auto &iov = self->iovs[idx];
                if (!iov) {
                  XLOGF(FATAL, "same cqe {} fetched more than once", cqes[i].index);
                }
                iov->result = cqes[i].result;
                userdatas.push_back((PyObject *)cqes[i].userdata);
                out.emplace_back(std::move(iov));
                iov.reset();
              }
              if (!res) {
                break;
              }
            } while ((int)out.size() < max_results);
          }
          // GIL is held again: adopt the reference prepare() took for each
          // completion without adding another one.
          for (size_t i = 0; i < out.size(); ++i) {
            out[i]->userdata = py::reinterpret_steal<py::object>(userdatas[i]);
          }
          return out;
        },
        py::kw_only(),
        py::arg("max_results") = 0,
        py::arg("min_results") = 0,
        py::arg("timeout") = std::nullopt,
        R"(
ioring
Args:
max_results:
min_results:
timeout: None
)")
    // NOTE(review): the shared_ptr deleter installed by the constructor calls
    // hf3fs_iordestroy again when the object is released - confirm that
    // hf3fs_iordestroy tolerates being called twice on the same ring.
    .def("destroy", [](const std::shared_ptr<Hf3fsIorWithIovs> &self) { hf3fs_iordestroy(self.get()); });
// punch_hole(filename, start, end, flags=0): opens `filename` read-write and
// passes the ranges described by the parallel `start` / `end` lists to
// hf3fs_punchhole.
// Raises RuntimeError when the lists differ in length or hold more than
// HF3FS_IOCTL_PUNCH_HOLE_MAX entries, and OSError when the file cannot be
// opened or hf3fs_punchhole returns non-zero.
m.def(
    "punch_hole",
    [](const std::string &filename, const std::vector<size_t> &start, const std::vector<size_t> &end, size_t flags) {
      if (start.size() != end.size()) {
        throw std::runtime_error("size not equal");
      }
      if (start.size() > HF3FS_IOCTL_PUNCH_HOLE_MAX) {
        throw std::runtime_error("too many ranges to punch in one call");
      }
      const int fd = open(filename.c_str(), O_RDWR);
      if (fd < 0) {
        // Report why the open failed instead of handing an invalid descriptor
        // to hf3fs_punchhole and close.
        throw OSException{errno};
      }
      const auto res = hf3fs_punchhole(fd, start.size(), start.data(), end.data(), flags);
      close(fd);
      if (res != 0) {
        throw OSException{res};
      }
    },
    py::arg("filename"),
    py::arg("start"),
    py::arg("end"),
    py::arg("flags") = 0);
// punch_hole(fd, start, end, flags=0): same as the filename overload but
// operates on an already opened descriptor, which is left open.
// Raises RuntimeError when the lists differ in length or hold more than
// HF3FS_IOCTL_PUNCH_HOLE_MAX entries, and OSError when hf3fs_punchhole
// returns non-zero.
m.def(
    "punch_hole",
    [](int fd, const std::vector<size_t> &start, const std::vector<size_t> &end, size_t flags) {
      if (start.size() != end.size()) {
        throw std::runtime_error("size not equal");
      }
      if (start.size() > HF3FS_IOCTL_PUNCH_HOLE_MAX) {
        throw std::runtime_error("too many ranges to punch in one call");
      }
      const auto res = hf3fs_punchhole(fd, start.size(), start.data(), end.data(), flags);
      if (res != 0) {
        throw OSException{res};
      }
    },
    py::arg("fd"),
    py::arg("start"),
    py::arg("end"),
    py::arg("flags") = 0);
}