Initial commit

This commit is contained in:
dev
2025-02-27 21:53:53 +08:00
commit 815e55e4c0
1291 changed files with 185445 additions and 0 deletions

View File

@@ -0,0 +1 @@
# Declares the `analytics` target through the project's target_add_lib() helper.
# NOTE(review): `common` and `apache_arrow_static` are presumably the libraries it
# depends on — confirm against the helper's definition.
target_add_lib(analytics common apache_arrow_static)

65
src/analytics/Common.h Normal file
View File

@@ -0,0 +1,65 @@
#pragma once
#include "common/serde/Serde.h"
#include "common/utils/StrongType.h"
// Type aliases and concepts that classify a type by which custom (de)serialization
// hooks it provides. The concepts are written so that each one explicitly excludes
// the ones defined before it, which makes the categories mutually exclusive.
namespace hf3fs::serde {
// Result type of invoking the member function `T::serdeToReadable` on a `T`.
template <typename T>
using SerdeToReadableMemberMethodReturnType = std::invoke_result_t<decltype(&T::serdeToReadable), T>;
// Result type of invoking the member function `T::serdeTo` on a `T`.
template <typename T>
using SerdeToMemberMethodReturnType = std::invoke_result_t<decltype(&T::serdeTo), T>;
// Result type of `serde::SerdeMethod<T>::serdeToReadable` invoked with a `const T &`.
template <typename T>
using SerdeToReadableReturnType = std::invoke_result_t<decltype(serde::SerdeMethod<T>::serdeToReadable), const T &>;
// Result type of `serde::SerdeMethod<T>::serdeTo` invoked with a `const T &`.
template <typename T>
using SerdeToReturnType = std::invoke_result_t<decltype(serde::SerdeMethod<T>::serdeTo), const T &>;
// Satisfied by any type implicitly convertible to std::string.
template <typename T>
concept ConvertibleToString = std::is_convertible_v<T, std::string>;
// T has member `serdeToReadable()` and static `serdeFromReadable(s)` returning something
// convertible to Result<T>. Strong types and string-convertible types are excluded.
template <typename T>
concept WithReadableSerdeMemberMethod =
!StrongTyped<T> && !ConvertibleToString<T> && requires(const T &t, SerdeToReadableMemberMethodReturnType<T> s) {
{ t.serdeToReadable() } -> std::convertible_to<SerdeToReadableMemberMethodReturnType<T>>;
{ T::serdeFromReadable(s) } -> std::convertible_to<Result<T>>;
};
// `serde::SerdeMethod<T>` provides `serdeToReadable(t)` / `serdeFromReadable(s)`.
// Only applies when T does not already satisfy WithReadableSerdeMemberMethod.
template <typename T>
concept WithReadableSerdeMethod = !StrongTyped<T> && !ConvertibleToString<T> && !WithReadableSerdeMemberMethod<T> &&
requires(const T &t, SerdeToReadableReturnType<T> s) {
{ serde::SerdeMethod<T>::serdeToReadable(t) } -> std::convertible_to<SerdeToReadableReturnType<T>>;
{ serde::SerdeMethod<T>::serdeFromReadable(s) } -> std::convertible_to<Result<T>>;
};
// T has member `serdeTo()` and static `serdeFrom(s)`.
// Only applies when neither of the "readable" concepts above is satisfied.
template <typename T>
concept WithSerdeMemberMethod =
!StrongTyped<T> && !ConvertibleToString<T> && !WithReadableSerdeMemberMethod<T> && !WithReadableSerdeMethod<T> &&
requires(const T &t, SerdeToMemberMethodReturnType<T> s) {
{ t.serdeTo() } -> std::convertible_to<SerdeToMemberMethodReturnType<T>>;
{ T::serdeFrom(s) } -> std::convertible_to<Result<T>>;
};
// `serde::SerdeMethod<T>` provides `serdeTo(t)` / `serdeFrom(s)`.
// Only applies when none of the three concepts above is satisfied.
template <typename T>
concept WithSerdeMethod =
!StrongTyped<T> && !ConvertibleToString<T> && !WithReadableSerdeMemberMethod<T> && !WithReadableSerdeMethod<T> &&
!WithSerdeMemberMethod<T> && requires(const T &t, SerdeToReturnType<T> s) {
{ serde::SerdeMethod<T>::serdeTo(t) } -> std::convertible_to<SerdeToReturnType<T>>;
{ serde::SerdeMethod<T>::serdeFrom(s) } -> std::convertible_to<Result<T>>;
};
// A serde::SerdeType that provides none of the four custom hooks above.
template <typename T>
concept SerdeTypeWithoutSpecializedSerdeMethod =
serde::SerdeType<T> && !WithReadableSerdeMemberMethod<T> && !WithReadableSerdeMethod<T> &&
!WithSerdeMemberMethod<T> && !WithSerdeMethod<T>;
} // namespace hf3fs::serde
namespace hf3fs::analytics {
// Suffixes appended to a field name to derive the name of an auxiliary column.
// Declared `inline` so that every translation unit including this header refers to the
// same object: these constants are odr-used from templates defined in headers, and a
// plain namespace-scope `const` would give each translation unit its own private copy.
inline const std::string kVariantValueIndexColumnSuffix = "ValIdx";
inline const std::string kResultErrorTypeColumnSuffix = "Error";
}  // namespace hf3fs::analytics

View File

@@ -0,0 +1,238 @@
#pragma once
#include <arrow/io/file.h>
#include <optional>
#include <parquet/stream_reader.h>
#include <type_traits>
#include <utility>
#include <variant>
#include "SerdeObjectVisitor.h"
#include "SerdeSchemaBuilder.h"
#include "common/serde/Serde.h"
#include "common/utils/Nameof.hpp"
#include "common/utils/StrongType.h"
#include "common/utils/TypeTraits.h"
namespace hf3fs::analytics {
/// Row-oriented reader that deserializes `SerdeType` objects from a parquet file.
///
/// Every `operator>>` call reads one row: the target object is walked through the `visit`
/// overloads below, each of which pulls its value(s) from the underlying
/// parquet::StreamReader in the order the fields are visited.
template <serde::SerdeType SerdeType>
class SerdeObjectReader : public BaseObjectVisitor<SerdeObjectReader<SerdeType>> {
 public:
  /// Takes over an already constructed stream reader.
  SerdeObjectReader(parquet::StreamReader &&reader)
      : reader_(std::move(reader)) {}

  /// Opens the parquet file at `path`.
  /// @return the reader, or nullptr (after logging) when the file cannot be opened or the
  ///         parquet stream reader cannot be created.
  static std::shared_ptr<SerdeObjectReader> open(const Path &path) {
    // open file
    auto openStream = arrow::io::ReadableFile::Open(path.string());
    if (!openStream.ok()) {
      XLOGF(ERR, "Failed to open file input stream: {}, error: {}", path.string(), openStream.status().message());
      return nullptr;
    }

    std::shared_ptr<arrow::io::ReadableFile> infile;
    PARQUET_ASSIGN_OR_THROW(infile, openStream);

    try {
      parquet::StreamReader streamReader{parquet::ParquetFileReader::Open(infile)};
      return std::make_shared<SerdeObjectReader>(std::move(streamReader));
    } catch (const std::exception &ex) {
      XLOGF(ERR, "Failed to create stream reader: {}, error: {}", path.string(), ex.what());
      return nullptr;
    }
  }

  /// Reads the next row into `v`.
  /// The end-of-file state is sampled before reading; if the reader is already failed or at
  /// end of file nothing is read. A parquet exception marks the reader as failed.
  SerdeObjectReader &operator>>(SerdeType &v) {
    eof_ = eof();
    if (!bool(*this)) return *this;

    try {
      visit("", v);
      reader_ >> parquet::EndRow;
    } catch (const parquet::ParquetException &ex) {
      XLOGF(CRITICAL, "Failed to read from parquet file, error: {}", ex.what());
      isOk_ = false;
    }

    return *this;
  }

  /// True while no read error occurred and the last `operator>>` did not start at end of file.
  operator bool() const { return ok() && !eof_; }

  bool ok() const { return isOk_; }

  bool eof() const { return reader_.eof(); }

  size_t numRows() const { return reader_.num_rows(); }

 public:
  // default
  template <typename T>
  void visit(std::string_view k, T &v) = delete;

  /// Arithmetic values are read directly from the stream.
  template <typename T>
  requires std::is_arithmetic_v<T>
  void visit(std::string_view k, T &v) {
    reader_ >> v;
    XLOGF(DBG3, "arithmetic visit({}): {}", k, v);
  }

  /// Enums are stored as int32; an unknown stored value is logged and leaves `v` untouched.
  template <typename T>
  requires std::is_enum_v<T>
  void visit(std::string_view k, T &v) {
    int32_t n;
    reader_ >> n;
    XLOGF(DBG3, "enum visit({}): {}", k, n);
    auto result = magic_enum::enum_cast<T>(n);
    if (result) {
      v = *result;
    } else {
      XLOGF(CRITICAL, "Failed to parse enum {} from value: {}", nameof::nameof_short_type<T>(), n);
    }
  }

  template <serde::ConvertibleToString T>
  void visit(std::string_view k, T &v) {
    reader_ >> v;
    XLOGF(DBG3, "string visit({}): {}", k, v);
  }

  template <StrongTyped T>
  void visit(std::string_view k, T &v) {
    XLOGF(DBG3, "strongtyped visit({})", k);
    BaseObjectVisitor<SerdeObjectReader>::visit(k, v);
  }

  /// Reads the serialized form and converts it back with SerdeMethod<T>::serdeFromReadable.
  template <serde::WithReadableSerdeMethod T>
  void visit(std::string_view k, T &val) {
    XLOGF(DBG3, "WithReadableSerdeMethod visit({})", k);
    typename serde::SerdeToReadableReturnType<T> serialized;
    visit(k, serialized);
    auto result = serde::SerdeMethod<T>::serdeFromReadable(serialized);
    if (result) {
      val = *result;
    } else {
      XLOGF(CRITICAL, "Failed to parse {} from value: {}", nameof::nameof_short_type<T>(), serialized);
    }
  }

  /// Reads the serialized form and converts it back with SerdeMethod<T>::serdeFrom.
  template <serde::WithSerdeMethod T>
  void visit(std::string_view k, T &val) {
    XLOGF(DBG3, "WithSerdeMethod visit({})", k);
    typename serde::SerdeToReturnType<T> serialized;
    visit(k, serialized);
    auto result = serde::SerdeMethod<T>::serdeFrom(serialized);
    if (result) {
      val = *result;
    } else {
      XLOGF(CRITICAL, "Failed to parse {} from value: {}", nameof::nameof_short_type<T>(), serialized);
    }
  }

  /// Reads the serialized form and converts it back with T::serdeFromReadable.
  template <serde::WithReadableSerdeMemberMethod T>
  void visit(std::string_view k, T &val) {
    XLOGF(DBG3, "WithReadableSerdeMemberMethod visit({})", k);
    serde::SerdeToReadableMemberMethodReturnType<T> serialized;
    visit(k, serialized);
    auto result = T::serdeFromReadable(serialized);
    if (result) {
      val = *result;
    } else {
      XLOGF(CRITICAL, "Failed to parse {} from value: {}", nameof::nameof_short_type<T>(), serialized);
    }
  }

  /// Reads the serialized form and converts it back with T::serdeFrom.
  /// Must mirror the serdeTo()/serdeFrom pair required by WithSerdeMemberMethod; types
  /// matching that concept are not guaranteed to have the "readable" variants.
  template <serde::WithSerdeMemberMethod T>
  void visit(std::string_view k, T &val) {
    XLOGF(DBG3, "WithSerdeMemberMethod visit({})", k);
    serde::SerdeToMemberMethodReturnType<T> serialized;
    visit(k, serialized);
    auto result = T::serdeFrom(serialized);
    if (result) {
      val = *result;
    } else {
      XLOGF(CRITICAL, "Failed to parse {} from value: {}", nameof::nameof_short_type<T>(), serialized);
    }
  }

  /// Plain serde structs are delegated to the base visitor.
  template <serde::SerdeTypeWithoutSpecializedSerdeMethod T>
  void visit(std::string_view k, T &val) {
    XLOGF(DBG3, "serdetype visit({})", k);
    BaseObjectVisitor<SerdeObjectReader>::visit(k, val);
  }

  /// folly::Expected: the error column (`k` + error suffix) is read first, then the value
  /// column. The value is kept only when the stored status is OK.
  template <typename T>
  requires is_specialization_of_v<T, folly::Expected>
  void visit(std::string_view k, T &val) {
    XLOGF(DBG3, "result visit({})", k);
    std::string errorColumnName = std::string{k} + kResultErrorTypeColumnSuffix;
    Status status(StatusCode::kOK);
    typename T::value_type value;
    visit<typename T::error_type>(errorColumnName, status);
    visit<typename T::value_type>(k, value);
    if (status.isOK()) {
      val = std::move(value);
    } else {
      val = makeError(status);
    }
  }

  /// Variants: reads the stored alternative index, then reads a value for every alternative
  /// (all of them occupy columns) and keeps only the one at the stored index.
  template <typename T>
  requires is_variant_v<T>
  void visit(std::string_view k, T &val) {
    XLOGF(DBG3, "variant visit({})", k);
    // get the index of value
    std::string valIdxColumnName = std::string{k} + kVariantValueIndexColumnSuffix;
    uint32_t valIdx = 0;
    visit<uint32_t>(valIdxColumnName, valIdx);
    // read and set the value
    uint32_t altIdx = 0;
    visitVariant(val, [&](std::string_view typeName, auto &&v) {
      std::string altTypeName = std::string{k} + std::string{typeName};
      std::remove_reference_t<decltype(v)> alt;
      visit(altTypeName, alt);
      if (altIdx == valIdx) val = std::move(alt);
      altIdx++;
    });
  }

  /// Vectors and sets are stored as one JSON string column.
  template <typename T>
  requires is_vector_v<T> || is_set_v<T>
  void visit(std::string_view k, T &val) {
    std::string str;
    reader_ >> str;
    XLOGF(DBG3, "container visit({}): {}", k, str);
    auto result = serde::fromJsonString(val, str);
    if (!result) {
      XLOGF(CRITICAL, "Failed to parse {} from json string: {}", nameof::nameof_short_type<T>(), str);
    }
  }

  /// Optionals are stored as one string column: empty means nullopt, otherwise JSON.
  template <typename T>
  requires is_optional_v<T>
  void visit(std::string_view k, T &val) {
    std::string str;
    reader_ >> str;
    XLOGF(DBG3, "container visit({}): {}", k, str);
    if (str.empty()) {
      val = std::nullopt;
    } else {
      using ValueType = typename T::value_type;
      val = ValueType();
      auto result = serde::fromJsonString(*val, str);
      if (!result) {
        XLOGF(CRITICAL, "Failed to parse {} from json string: {}", nameof::nameof_short_type<ValueType>(), str);
      }
    }
  }

 private:
  parquet::StreamReader reader_;
  bool isOk_{true};   // cleared when a parquet exception is caught in operator>>
  bool eof_{false};   // end-of-file state sampled at the start of the last operator>>
};
} // namespace hf3fs::analytics

View File

@@ -0,0 +1,129 @@
#pragma once
#include <string_view>
#include <type_traits>
#include <utility>
#include <variant>
#include "Common.h"
#include "common/serde/Serde.h"
#include "common/utils/Nameof.hpp"
#include "common/utils/StrongType.h"
#include "common/utils/TypeTraits.h"
namespace hf3fs::analytics {
// Declares the set of `visit(key, value)` overload categories an object visitor deals with.
// Every overload is deleted here; classes deriving from it (see BaseObjectVisitor) declare
// their own `visit` overloads for the categories they support.
class ObjectVisitor {
public:
// default
template <typename T>
void visit(std::string_view, T &) = delete;
// arithmetic types
template <typename T>
requires std::is_arithmetic_v<T>
void visit(std::string_view, T &) = delete;
// enum types
template <typename T>
requires std::is_enum_v<T>
void visit(std::string_view, T &) = delete;
// strong typedefs
template <StrongTyped T>
void visit(std::string_view, T &) = delete;
// anything implicitly convertible to std::string
template <serde::ConvertibleToString T>
void visit(std::string_view, T &) = delete;
// serde structs without custom serdeTo/serdeFrom hooks
template <serde::SerdeTypeWithoutSpecializedSerdeMethod T>
void visit(std::string_view, T &) = delete;
// std::variant-like types
template <typename T>
requires is_variant_v<T>
void visit(std::string_view, T &) = delete;
// vectors and sets
template <typename T>
requires is_vector_v<T> || is_set_v<T>
void visit(std::string_view, T &) = delete;
// optionals
template <typename T>
requires is_optional_v<T>
void visit(std::string_view, T &) = delete;
};
// Invokes `func(shortTypeName, value)` once for every alternative of the variant `t`,
// in declaration order starting at index I. The active alternative is passed by reference
// to the stored value; every other alternative is passed as a value-initialized temporary.
template <size_t I = 0>
inline void visitVariant(auto &&t, auto &&func) {
  using Variant = std::decay_t<decltype(t)>;
  using Alternative = std::variant_alternative_t<I, Variant>;

  if (t.index() != I) {
    // inactive alternative: hand out a placeholder of the right type
    func(nameof::nameof_short_type<Alternative>(), Alternative{});
  } else {
    func(nameof::nameof_short_type<Alternative>(), std::get<I>(t));
  }

  constexpr size_t kNext = I + 1;
  if constexpr (kNext < std::variant_size_v<Variant>) {
    visitVariant<kNext>(t, func);
  }
}
/// CRTP base providing the structural part of object visiting: it decomposes composite
/// values (strong types, serde structs, variants, containers, optionals) and forwards each
/// component to `Derived::visit`.
///
/// Calls that rely on argument deduction are written without the `template`
/// disambiguator: the keyword must be followed by a template argument list, and
/// conforming compilers diagnose `->template visit(args)`.
template <typename Derived>
class BaseObjectVisitor : public ObjectVisitor {
 public:
  template <typename T>
  void visit(std::string_view k, T &) = delete;

  /// Arithmetic values are forwarded unchanged to the derived visitor.
  template <typename T>
  requires std::is_arithmetic_v<T>
  void visit(std::string_view k, T &v) {
    XLOGF(DBG3, "arithmetic visit({})", k);
    static_cast<Derived *>(this)->template visit<T>(k, v);
  }

  template <typename T>
  requires std::is_enum_v<T>
  void visit(std::string_view k, T &) = delete;

  template <serde::ConvertibleToString T>
  void visit(std::string_view k, T &) = delete;

  /// Strong types are visited as their underlying type.
  template <StrongTyped T>
  void visit(std::string_view k, T &v) {
    XLOGF(DBG3, "strongtyped visit({})", k);
    static_cast<Derived *>(this)->template visit<typename T::UnderlyingType>(k, v);
  }

  /// Serde structs: every reflected field is visited under its own field name.
  template <serde::SerdeTypeWithoutSpecializedSerdeMethod T>
  void visit(std::string_view k, T &val) {
    XLOGF(DBG3, "serdetype visit({})", k);
    refl::Helper::iterate<T>([&](auto type) { static_cast<Derived *>(this)->visit(type.name, val.*type.getter); });
  }

  /// Variants: every alternative is visited under the key `k` + short type name.
  template <typename T>
  requires is_variant_v<T>
  void visit(std::string_view k, T &val) {
    XLOGF(DBG3, "variant visit({})", k);
    visitVariant(val, [&](std::string_view typeName, auto &&v) {
      std::string altTypeName = std::string{k} + std::string{typeName};
      static_cast<Derived *>(this)->visit(altTypeName, v);
    });
  }

  /// Vectors and sets: every element is visited under the container's key.
  template <typename T>
  requires is_vector_v<T> || is_set_v<T>
  void visit(std::string_view k, T &val) {
    XLOGF(DBG3, "container visit({})", k);
    // Elements are visited as copies, so changes made by the derived visitor do not
    // propagate back into the container.
    for (auto item : val) {
      static_cast<Derived *>(this)->visit(k, item);
    }
  }

  /// Optionals: the contained value is visited only when present.
  template <typename T>
  requires is_optional_v<T>
  void visit(std::string_view k, T &val) {
    XLOGF(DBG3, "optional visit({})", k);
    using ValueType = typename T::value_type;
    if (val.has_value()) {
      static_cast<Derived *>(this)->template visit<ValueType>(k, *val);
    }
  }
};
} // namespace hf3fs::analytics

View File

@@ -0,0 +1,241 @@
#pragma once
#include <arrow/io/file.h>
#include <parquet/stream_writer.h>
#include <type_traits>
#include <utility>
#include <variant>
#include "SerdeObjectVisitor.h"
#include "SerdeSchemaBuilder.h"
#include "common/serde/Serde.h"
#include "common/utils/Nameof.hpp"
#include "common/utils/StrongType.h"
#include "common/utils/UtcTime.h"
namespace hf3fs::analytics {
/// Row-oriented writer that serializes `SerdeType` objects into a parquet file.
///
/// Every `operator<<` call writes one row: the object is walked through the `visit`
/// overloads below, each of which pushes its value(s) into the underlying
/// parquet::StreamWriter in the order the fields are visited.
template <serde::SerdeType SerdeType>
class SerdeObjectWriter : public BaseObjectVisitor<SerdeObjectWriter<SerdeType>> {
 public:
  /// Takes over an already constructed stream writer and records the creation time.
  SerdeObjectWriter(parquet::StreamWriter &&writer)
      : writer_(std::move(writer)),
        createTime_(UtcClock::now()) {}

  /// Opens `path` for writing with a schema generated from `SerdeType`.
  /// @param append             forwarded to arrow::io::FileOutputStream::Open
  /// @param maxRowGroupLength  forwarded to the parquet writer properties
  /// @param sortedColumns      forwarded to the parquet writer properties
  /// @return the writer, or nullptr when the file, the schema or the parquet writer
  ///         cannot be created.
  static std::shared_ptr<SerdeObjectWriter> open(const Path &path,
                                                 const bool append = false,
                                                 const size_t maxRowGroupLength = 1'000'000,
                                                 const std::vector<parquet::SortingColumn> &sortedColumns = {}) {
    // open file
    auto openStream = arrow::io::FileOutputStream::Open(path.string(), append);
    if (!openStream.ok()) {
      XLOGF(ERR, "Failed to open file output stream: {}, error: {}", path.string(), openStream.status().message());
      return nullptr;
    }

    std::shared_ptr<arrow::io::FileOutputStream> outfile;
    PARQUET_ASSIGN_OR_THROW(outfile, openStream);

    // generate schema
    SerdeSchemaBuilder<SerdeType> schemaBuilder;
    auto schemaNode = schemaBuilder.getSchema();
    if (schemaNode == nullptr) {
      return nullptr;
    }

    parquet::WriterProperties::Builder writerBuilder;
    writerBuilder.set_sorting_columns(sortedColumns);
    writerBuilder.max_row_group_length(maxRowGroupLength);
    writerBuilder.data_page_version(parquet::ParquetDataPageVersion::V2);
    // set global encoding and compression method
    // writerBuilder.encoding(parquet::Encoding::DELTA_BINARY_PACKED);
    writerBuilder.compression(parquet::Compression::ZSTD);
    // set encoding for string columns
    // for (int fieldIndex = 0; fieldIndex < schemaNode->field_count(); fieldIndex++) {
    // auto fieldNode = schemaNode->field(fieldIndex);
    // auto fieldType = fieldNode->logical_type();
    // if (fieldType->is_string()) writerBuilder.encoding(fieldNode->name(),
    // parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY);
    // }

    try {
      auto fileWriter = parquet::ParquetFileWriter::Open(outfile, schemaNode, writerBuilder.build());
      if (fileWriter == nullptr) {
        XLOGF(ERR, "Failed to open file writer: {}", path.string());
        return nullptr;
      }
      parquet::StreamWriter streamWriter(std::move(fileWriter));
      return std::make_shared<SerdeObjectWriter>(std::move(streamWriter));
    } catch (const std::exception &ex) {
      XLOGF(ERR, "Failed to create stream writer: {}, error: {}", path.string(), ex.what());
      return nullptr;
    }
  }

  /// Writes `v` as one row. Does nothing once the writer has failed; a parquet exception
  /// marks the writer as failed.
  SerdeObjectWriter &operator<<(const SerdeType &v) {
    if (!bool(*this)) return *this;

    try {
      visit("", v);
      writer_ << parquet::EndRow;
    } catch (const parquet::ParquetException &ex) {
      XLOGF(CRITICAL, "Failed to write to parquet file, error: {}", ex.what());
      isOk_ = false;
    }

    return *this;
  }

  void endRowGroup() { writer_.EndRowGroup(); }

  /// Time at which this writer object was constructed.
  UtcTime createTime() const { return createTime_; }

  operator bool() const { return ok(); }

  bool ok() const { return isOk_; }

 public:
  // default
  template <typename T>
  void visit(std::string_view k, const T &v) = delete;

  /// Arithmetic values are written directly to the stream.
  template <typename T>
  requires std::is_arithmetic_v<T>
  void visit(std::string_view k, const T &v) {
    XLOGF(DBG3, "arithmetic visit({})", k);
    writer_ << v;
  }

  /// Enums are written as int32.
  template <typename T>
  requires std::is_enum_v<T>
  void visit(std::string_view k, const T &v) {
    XLOGF(DBG3, "enum visit({})", k);
    writer_ << static_cast<int32_t>(v);
  }

  template <serde::ConvertibleToString T>
  void visit(std::string_view k, const T &v) {
    XLOGF(DBG3, "string visit({})", k);
    writer_ << v;
  }

  /// Strong types are written as their underlying value.
  template <StrongTyped T>
  void visit(std::string_view k, const T &v) {
    XLOGF(DBG3, "strongtyped visit({})", k);
    visit<typename T::UnderlyingType>(k, v.toUnderType());
  }

  /// Writes the result of SerdeMethod<T>::serdeToReadable.
  template <serde::WithReadableSerdeMethod T>
  void visit(std::string_view k, const T &val) {
    auto serialized = serde::SerdeMethod<T>::serdeToReadable(val);
    XLOGF(DBG3,
          "WithReadableSerdeMethod visit({}), serialized: {} {}",
          k,
          nameof::nameof_type<decltype(serialized)>(),
          serialized);
    visit<serde::SerdeToReadableReturnType<T>>(k, serialized);
  }

  /// Writes the result of SerdeMethod<T>::serdeTo.
  template <serde::WithSerdeMethod T>
  void visit(std::string_view k, const T &val) {
    auto serialized = serde::SerdeMethod<T>::serdeTo(val);
    XLOGF(DBG3,
          "WithSerdeMethod visit({}), serialized: {} {}",
          k,
          nameof::nameof_type<decltype(serialized)>(),
          serialized);
    visit(k, serialized);
  }

  /// Writes the result of T::serdeToReadable().
  template <serde::WithReadableSerdeMemberMethod T>
  void visit(std::string_view k, const T &v) {
    auto serialized = v.serdeToReadable();
    XLOGF(DBG3,
          "WithReadableSerdeMemberMethod visit({}), serialized: {} {}",
          k,
          nameof::nameof_type<decltype(serialized)>(),
          serialized);
    visit(k, serialized);
  }

  /// Writes the result of T::serdeTo().
  template <serde::WithSerdeMemberMethod T>
  void visit(std::string_view k, const T &v) {
    auto serialized = v.serdeTo();
    XLOGF(DBG3,
          "WithSerdeMemberMethod visit({}), serialized: {} {}",
          k,
          nameof::nameof_type<decltype(serialized)>(),
          serialized);
    visit(k, serialized);
  }

  /// Plain serde structs are delegated to the base visitor.
  template <serde::SerdeTypeWithoutSpecializedSerdeMethod T>
  void visit(std::string_view k, const T &val) {
    XLOGF(DBG3, "serdetype visit({})", k);
    // The base visitor takes a non-const reference.
    BaseObjectVisitor<SerdeObjectWriter>::visit(k, const_cast<T &>(val));
  }

  /// folly::Expected: writes the error column (`k` + error suffix) followed by the value
  /// column. A value-initialized placeholder is written for whichever side is absent
  /// (an OK status, or a default value).
  template <typename T>
  requires is_specialization_of_v<T, folly::Expected>
  void visit(std::string_view k, const T &val) {
    XLOGF(DBG3, "result visit({})", k);
    std::string errorColumnName = std::string{k} + kResultErrorTypeColumnSuffix;
    if (val.hasValue()) {
      Status ok(StatusCode::kOK);
      visit<typename T::error_type>(errorColumnName, ok);
      visit<typename T::value_type>(k, val.value());
    } else {
      typename T::value_type value{};
      visit<typename T::error_type>(errorColumnName, val.error());
      visit<typename T::value_type>(k, value);
    }
  }

  /// Variants: writes the active alternative index, then lets the base visitor write a
  /// column for every alternative.
  template <typename T>
  requires is_variant_v<T>
  void visit(std::string_view k, const T &val) {
    XLOGF(DBG3, "variant visit({})", k);
    std::string valIdxColumnName = std::string{k} + kVariantValueIndexColumnSuffix;
    visit<uint32_t>(valIdxColumnName, val.index());
    BaseObjectVisitor<SerdeObjectWriter>::visit(k, const_cast<T &>(val));
  }

  /// Vectors and sets are written as one JSON string column.
  template <typename T>
  requires is_vector_v<T> || is_set_v<T>
  void visit(std::string_view k, const T &val) {
    XLOGF(DBG3, "container visit({})", k);
    auto str = serde::toJsonString(val);
    writer_ << str;
  }

  /// Optionals are written as one string column: empty for nullopt, otherwise JSON.
  template <typename T>
  requires is_optional_v<T>
  void visit(std::string_view k, const T &val) {
    XLOGF(DBG3, "optional visit({})", k);
    if (!val.has_value()) {
      writer_ << "";
    } else {
      auto str = serde::toJsonString(*val);
      writer_ << str;
    }
  }

 private:
  parquet::StreamWriter writer_;
  UtcTime createTime_;
  bool isOk_{true};  // cleared when a parquet exception is caught in operator<<
};
// Stream manipulator support: `writer << parquet::EndRowGroup` closes the current row group
// and returns the writer for further chaining.
template <typename T>
SerdeObjectWriter<T> &operator<<(SerdeObjectWriter<T> &writer, parquet::EndRowGroupType) {
  return writer.endRowGroup(), writer;
}
} // namespace hf3fs::analytics

View File

@@ -0,0 +1,219 @@
#pragma once
#include <folly/logging/xlog.h>
#include <numeric>
#include <parquet/exception.h>
#include <parquet/schema.h>
#include "SerdeStructVisitor.h"
#include "common/serde/Serde.h"
#include "common/utils/Result.h"
#include "common/utils/TypeTraits.h"
namespace hf3fs::analytics {
/// Builds the parquet schema of `SerdeType` by visiting its type structure.
///
/// Nested serde structs are flattened: a leaf column is named by joining the sanitized
/// names of the enclosing fields and of the leaf field with '_'.
template <serde::SerdeType SerdeType>
class SerdeSchemaBuilder : public BaseStructVisitor<SerdeSchemaBuilder<SerdeType>> {
 public:
  /// Generates the schema from scratch.
  /// @return the root group node, or nullptr (after logging) on a parquet exception.
  std::shared_ptr<parquet::schema::GroupNode> getSchema() {
    try {
      fields_.clear();
      fieldNameParts_.clear();
      this->visit<SerdeType>("");
      return std::static_pointer_cast<parquet::schema::GroupNode>(
          parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields_));
    } catch (const parquet::ParquetException &ex) {
      XLOGF(CRITICAL, "Failed to build schema of type {}, error: {}", nameof::nameof_full_type<SerdeType>(), ex.what());
      return nullptr;
    }
  }

 public:
  // default
  template <typename T>
  void visit(std::string_view k) = delete;

  template <>
  void visit<bool>(std::string_view k) {
    XLOGF(DBG3, "bool visit({}), fullname: '{}'", k, getFieldFullName(k));
    addRequiredColumn(k, parquet::LogicalType::None(), parquet::Type::BOOLEAN);
  }

  template <>
  void visit<int16_t>(std::string_view k) {
    XLOGF(DBG3, "int16_t visit({}), fullname: '{}'", k, getFieldFullName(k));
    addRequiredColumn(k, parquet::LogicalType::Int(16, true), parquet::Type::INT32);
  }

  template <>
  void visit<uint16_t>(std::string_view k) {
    XLOGF(DBG3, "uint16_t visit({}), fullname: '{}'", k, getFieldFullName(k));
    addRequiredColumn(k, parquet::LogicalType::Int(16, false), parquet::Type::INT32);
  }

  template <>
  void visit<int32_t>(std::string_view k) {
    XLOGF(DBG3, "int32_t visit({}), fullname: '{}'", k, getFieldFullName(k));
    addRequiredColumn(k, parquet::LogicalType::Int(32, true), parquet::Type::INT32);
  }

  template <>
  void visit<uint32_t>(std::string_view k) {
    XLOGF(DBG3, "uint32_t visit({}), fullname: '{}'", k, getFieldFullName(k));
    addRequiredColumn(k, parquet::LogicalType::Int(32, false), parquet::Type::INT32);
  }

  template <>
  void visit<int64_t>(std::string_view k) {
    XLOGF(DBG3, "int64_t visit({}), fullname: '{}'", k, getFieldFullName(k));
    addRequiredColumn(k, parquet::LogicalType::Int(64, true), parquet::Type::INT64);
  }

  template <>
  void visit<uint64_t>(std::string_view k) {
    XLOGF(DBG3, "uint64_t visit({}), fullname: '{}'", k, getFieldFullName(k));
    addRequiredColumn(k, parquet::LogicalType::Int(64, false), parquet::Type::INT64);
  }

  /// Enums are stored as signed 32-bit integers.
  template <typename T>
  requires std::is_enum_v<T>
  void visit(std::string_view k) {
    XLOGF(DBG3, "enum visit({}), fullname: '{}'", k, getFieldFullName(k));
    addRequiredColumn(k, parquet::LogicalType::Int(32, true), parquet::Type::INT32);
  }

  template <serde::ConvertibleToString T>
  void visit(std::string_view k) {
    XLOGF(DBG3, "string visit({}), fullname: '{}'", k, getFieldFullName(k));
    addRequiredColumn(k, parquet::LogicalType::String(), parquet::Type::BYTE_ARRAY);
  }

  template <StrongTyped T>
  void visit(std::string_view k) {
    XLOGF(DBG3, "strongtyped visit({}), fullname: '{}'", k, getFieldFullName(k));
    BaseStructVisitor<SerdeSchemaBuilder>::template visit<T>(k);
  }

  // Types with custom serde hooks get the column(s) of their serialized representation.
  template <serde::WithReadableSerdeMethod T>
  void visit(std::string_view k) {
    XLOGF(DBG3, "WithReadableSerdeMethod visit({})", k);
    visit<serde::SerdeToReadableReturnType<T>>(k);
  }

  template <serde::WithSerdeMethod T>
  void visit(std::string_view k) {
    XLOGF(DBG3, "WithSerdeMethod visit({})", k);
    visit<serde::SerdeToReturnType<T>>(k);
  }

  template <serde::WithReadableSerdeMemberMethod T>
  void visit(std::string_view k) {
    XLOGF(DBG3, "WithReadableSerdeMemberMethod visit({})", k);
    visit<serde::SerdeToReadableMemberMethodReturnType<T>>(k);
  }

  template <serde::WithSerdeMemberMethod T>
  void visit(std::string_view k) {
    XLOGF(DBG3, "WithSerdeMemberMethod visit({})", k);
    visit<serde::SerdeToMemberMethodReturnType<T>>(k);
  }

  /// Nested serde struct: its (sanitized) field name becomes a prefix part for the
  /// columns of all fields visited inside it. An empty key adds no prefix part.
  template <serde::SerdeTypeWithoutSpecializedSerdeMethod T>
  void visit(std::string_view k) {
    XLOGF(DBG3, "serdetype visit({}), fullname: '{}'", k, getFieldFullName(k));
    if (!k.empty()) fieldNameParts_.push_back(filterOutInvalidChars(k));
    BaseStructVisitor<SerdeSchemaBuilder>::template visit<T>(k);
    if (!k.empty()) fieldNameParts_.pop_back();
  }

  /// folly::Expected: an error column (`k` + error suffix) followed by the value column(s).
  template <typename T>
  requires is_specialization_of_v<T, folly::Expected>
  void visit(std::string_view k) {
    XLOGF(DBG3, "result visit({}), fullname: '{}'", k, getFieldFullName(k));
    std::string errorColumnName = std::string{k} + kResultErrorTypeColumnSuffix;
    visit<typename T::error_type>(errorColumnName);
    visit<typename T::value_type>(k);
  }

  /// Variants: an index column (`k` + index suffix) followed by columns for every alternative.
  template <typename T>
  requires is_variant_v<T>
  void visit(std::string_view k) {
    XLOGF(DBG3, "variant visit({}), fullname: '{}'", k, getFieldFullName(k));
    std::string valIdxColumnName = std::string{k} + kVariantValueIndexColumnSuffix;
    visit<uint32_t>(valIdxColumnName);
    BaseStructVisitor<SerdeSchemaBuilder>::template visit<T>(k);
  }

  /// Vectors and sets occupy a single string column.
  template <typename T>
  requires is_vector_v<T> || is_set_v<T>
  void visit(std::string_view k) {
    XLOGF(DBG3, "container visit({}), fullname: '{}'", k, getFieldFullName(k));
    addRequiredColumn(k, parquet::LogicalType::String(), parquet::Type::BYTE_ARRAY);
  }

  /// Optionals occupy a single string column.
  template <typename T>
  requires is_optional_v<T>
  void visit(std::string_view k) {
    XLOGF(DBG3, "container visit({}), fullname: '{}'", k, getFieldFullName(k));
    addRequiredColumn(k, parquet::LogicalType::String(), parquet::Type::BYTE_ARRAY);
  }

 private:
  /// Appends a REQUIRED primitive column named after the full name of field `k`.
  void addRequiredColumn(std::string_view k, const auto &logicalType, auto physicalType) {
    fields_.push_back(parquet::schema::PrimitiveNode::Make(getFieldFullName(k),
                                                           parquet::Repetition::REQUIRED,
                                                           logicalType,
                                                           physicalType));
  }

  /// Joins the current prefix parts and the sanitized `k` with '_'.
  /// A separator is inserted only when the name built so far is non-empty, so leading
  /// empty parts do not produce a leading '_'.
  std::string getFieldFullName(std::string_view k) const {
    std::string fullName;
    const auto appendPart = [&fullName](const std::string &part) {
      if (!fullName.empty()) fullName += '_';
      fullName += part;
    };
    for (const auto &part : fieldNameParts_) appendPart(part);
    appendPart(filterOutInvalidChars(k));
    return fullName;
  }

  /// Returns `k` with every character outside [a-zA-Z0-9] removed.
  static std::string filterOutInvalidChars(std::string_view k) {
    std::string filtered;
    filtered.reserve(k.size());
    for (const char c : k) {
      const bool isAlnum = ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9');
      if (isAlnum) filtered.push_back(c);
    }
    return filtered;
  }

 private:
  parquet::schema::NodeVector fields_;       // leaf columns collected so far, in visiting order
  std::vector<std::string> fieldNameParts_;  // sanitized names of the enclosing struct fields
};
} // namespace hf3fs::analytics

View File

@@ -0,0 +1,123 @@
#pragma once
#include <folly/logging/xlog.h>
#include <string_view>
#include <type_traits>
#include <utility>
#include <variant>
#include "Common.h"
#include "common/serde/Serde.h"
#include "common/utils/Nameof.hpp"
#include "common/utils/StrongType.h"
#include "common/utils/TypeTraits.h"
namespace hf3fs::analytics {
// Declares the set of type-only `visit<T>(key)` overload categories a struct visitor deals
// with. Every overload is deleted here; classes deriving from it (see BaseStructVisitor)
// declare their own `visit` overloads for the categories they support.
class StructVisitor {
public:
// default
template <typename T>
void visit(std::string_view) = delete;
// arithmetic types
template <typename T>
requires std::is_arithmetic_v<T>
void visit(std::string_view) = delete;
// enum types
template <typename T>
requires std::is_enum_v<T>
void visit(std::string_view) = delete;
// strong typedefs
template <StrongTyped T>
void visit(std::string_view) = delete;
// serde structs without custom serdeTo/serdeFrom hooks
template <serde::SerdeTypeWithoutSpecializedSerdeMethod T>
void visit(std::string_view) = delete;
// std::variant-like types
template <typename T>
requires is_variant_v<T>
void visit(std::string_view) = delete;
// vectors and sets
template <typename T>
requires is_vector_v<T> || is_set_v<T>
void visit(std::string_view) = delete;
// optionals
template <typename T>
requires is_optional_v<T>
void visit(std::string_view) = delete;
};
// Invokes `func(shortTypeName, std::type_identity<Alternative>{})` once for every
// alternative of the variant type T, in declaration order starting at index I.
template <class T, size_t I = 0>
inline void visitVariant(auto &&func) {
  using Alternative = std::variant_alternative_t<I, T>;
  func(nameof::nameof_short_type<Alternative>(), std::type_identity<Alternative>{});

  constexpr size_t kNext = I + 1;
  if constexpr (kNext < std::variant_size_v<T>) {
    visitVariant<T, kNext>(func);
  }
}
template <typename Derived>
class BaseStructVisitor : public StructVisitor {
public:
// default
template <typename T>
void visit(std::string_view k) = delete;
template <typename T>
requires std::is_arithmetic_v<T>
void visit(std::string_view k) {
XLOGF(DBG3, "arithmetic visit({})", k);
static_cast<Derived *>(this)->template visit<T>(k);
}
template <typename T>
requires std::is_enum_v<T>
void visit(std::string_view k) = delete;
template <serde::ConvertibleToString T>
void visit(std::string_view k) = delete;
template <StrongTyped T>
void visit(std::string_view k) {
XLOGF(DBG3, "strongtyped visit({})", k);
static_cast<Derived *>(this)->template visit<typename T::UnderlyingType>(k);
}
template <serde::SerdeTypeWithoutSpecializedSerdeMethod T>
void visit(std::string_view k) {
XLOGF(DBG3, "serdetype visit({})", k);
refl::Helper::iterate<T>([&](auto field) {
using FieldType = std::decay_t<decltype(std::declval<T>().*field.getter)>;
static_cast<Derived *>(this)->template visit<FieldType>(field.name);
});
}
template <typename T>
requires is_variant_v<T>
void visit(std::string_view k) {
XLOGF(DBG3, "variant visit({})", k);
visitVariant<T>([&](std::string_view typeName, auto &&v) {
using AlternativeType = typename std::decay_t<decltype(v)>::type;
std::string altTypeName = std::string{k} + std::string{typeName};
static_cast<Derived *>(this)->template visit<AlternativeType>(altTypeName);
});
}
template <typename T>
requires is_vector_v<T> || is_set_v<T>
void visit(std::string_view k) {
XLOGF(DBG3, "container visit({})", k);
using ElemValueType = typename T::value_type;
static_cast<Derived *>(this)->template visit<ElemValueType>(k);
}
template <typename T>
requires is_optional_v<T>
void visit(std::string_view k) {
XLOGF(DBG3, "optional visit({})", k);
using ValueType = typename T::value_type;
static_cast<Derived *>(this)->template visit<ValueType>(k);
}
};
} // namespace hf3fs::analytics

View File

@@ -0,0 +1,285 @@
#pragma once
#include <folly/Random.h>
#include <folly/concurrency/UnboundedQueue.h>
#include <future>
#include "SerdeObjectWriter.h"
#include "common/monitor/Recorder.h"
#include "common/monitor/ScopedMetricsWriter.h"
#include "common/utils/ConfigBase.h"
#include "common/utils/Path.h"
#include "common/utils/SysResource.h"
#include "common/utils/UtcTime.h"
namespace hf3fs::analytics {
template <serde::SerdeType SerdeType>
class StructuredTraceLog : public folly::MoveOnly {
// Metadata stored alongside every trace entry.
struct TraceMeta {
// time value of the entry (std::time_t)
SERDE_STRUCT_FIELD(timestamp, std::time_t{});
// name of the host that produced the entry
SERDE_STRUCT_FIELD(hostname, String{});
};
// Row type handed to the writer: the metadata plus the user-provided payload.
struct StructuredTrace {
SERDE_STRUCT_FIELD(trace_meta, TraceMeta{});
// the payload itself, stored in a field named `_`
SERDE_STRUCT_FIELD(_, SerdeType{});
};
using WriterType = SerdeObjectWriter<StructuredTrace>;
using WriterPtr = std::shared_ptr<WriterType>;
public:
// Configuration of a StructuredTraceLog.
class Config : public hf3fs::ConfigBase<Config> {
public:
// directory used for the trace files
CONFIG_ITEM(trace_file_dir, Path{"."});
// Defaults depend on the build type: when NDEBUG is not defined the log is disabled with a
// 60 minute dump interval, otherwise it is enabled with a 30 second dump interval.
#ifndef NDEBUG
CONFIG_HOT_UPDATED_ITEM(enabled, false);
CONFIG_HOT_UPDATED_ITEM(dump_interval, 60_min);
#else
CONFIG_HOT_UPDATED_ITEM(enabled, true);
CONFIG_HOT_UPDATED_ITEM(dump_interval, 30_s);
#endif
// must be positive (ConfigCheckers::checkPositive)
CONFIG_HOT_UPDATED_ITEM(max_num_writers, size_t{1}, ConfigCheckers::checkPositive);
CONFIG_HOT_UPDATED_ITEM(max_row_group_length, size_t{100'000});
};
public:
// Builds the trace log from `config`, registers a config-update callback and schedules the
// first dump.
StructuredTraceLog(const Config &config)
: config_(config),
enabled_(config.enabled()),
typename_(nameof::nameof_short_type<SerdeType>()),
// falls back to "unknown_host" when the hostname cannot be determined
hostname_(SysResource::hostname().value_or("unknown_host")),
// metrics are tagged with the payload type name and the address of this instance
latencyTagSet_({{"tag", typename_}, {"instance", fmt::to_string(fmt::ptr(this))}}),
createLatency_("trace_log.create_latency", latencyTagSet_),
appendLatency_("trace_log.append_latency", latencyTagSet_),
flushLatency_("trace_log.flush_latency", latencyTagSet_),
maxNumWriters_(config.max_num_writers()) {
// React to hot updates of `enabled` and `max_num_writers`.
onConfigUpdated_ = config_.addCallbackGuard([this]() {
bool enabled = config_.enabled();
if (enabled_ != enabled) {
enableTraceLog(enabled);
// when the log gets disabled, flush synchronously
if (!enabled) flush(false /*async*/);
}
if (maxNumWriters_ != config_.max_num_writers()) {
updateMaxNumWriters(config_.max_num_writers());
}
});
// The first dump is scheduled at a random offset derived from the dump interval.
// NOTE(review): presumably rand64(min, max) draws from [interval/2, interval) to spread
// dumps of different instances — confirm against folly::Random.
uint64_t secsUntilFirstDump =
folly::Random::rand64(config_.dump_interval().asSec().count() / 2, config_.dump_interval().asSec().count());
nextDumpTime_ = microsecondsSinceEpoch(UtcClock::now() + std::chrono::seconds{secsUntilFirstDump});
}
// Calls close() on destruction (close() is defined outside the visible part of the file).
~StructuredTraceLog() { close(); }
bool open() {
auto writer = getOrCreateWriter();
if (!writer) return false;
writerPool_.enqueue(writer);
return true;
}
// Returns a heap-allocated copy of `init` that is appended to this trace log automatically
// when the last shared_ptr owner releases it. The deleter captures `this`, so the returned
// entry must not outlive the trace log.
std::shared_ptr<SerdeType> newEntry(const SerdeType &init = SerdeType{}) {
  auto appendThenDelete = [this](SerdeType *entry) {
    this->append(*entry);
    delete entry;
  };
  return std::shared_ptr<SerdeType>(new SerdeType(init), appendThenDelete);
}
void append(const SerdeType &msg) {
if (!enabled_) return;
{
monitor::ScopedLatencyWriter appendLatency(appendLatency_);
StructuredTrace trace{
.trace_meta = TraceMeta{.timestamp = UtcClock::secondsSinceEpoch(), .hostname = hostname_},
._ = msg,
};
WriterPtr writer = getOrCreateWriter();
if (UNLIKELY(writer == nullptr)) {
XLOGF(CRITICAL, "Cannot get a writer of {} trace log in directory {}", typename_, config_.trace_file_dir());
enableTraceLog(false);
return;
}
*writer << trace;
auto writerOk = writer->ok();
writerPool_.enqueue(std::move(writer));
if (UNLIKELY(!writerOk)) enableTraceLog(false);
}
auto currentTime = microsecondsSinceEpoch(UtcClock::now());
if (UNLIKELY(currentTime >= nextDumpTime_)) {
nextDumpTime_ = currentTime + config_.dump_interval().asUs().count();
flush(true /*async*/);
}
}
void flush(bool async, bool shutdown = false) {
auto running = dumpingTrace_.test_and_set();
if (running) return;
monitor::ScopedLatencyWriter flushLatency(flushLatency_);
if (asyncFlush_.valid()) asyncFlush_.wait();
asyncFlush_ = std::async(
std::launch::async,
[this](bool shutdown) {
size_t numWritersToClose = numWriters_.load();
auto now = UtcClock::now();
XLOGF(INFO,
"Flushing {} {} log writers in directory {}",
numWritersToClose,
typename_,
config_.trace_file_dir());
for (size_t i = 0; numWritersToClose > 0; i++) {
// give up flushing old writers after trying for too many loops
if (i >= 10 * maxNumWriters_) {
break;
}
auto writer = writerPool_.dequeue();
if (!writer) continue;
if (writer->createTime() > now) {
writerPool_.enqueue(writer);
continue;
}
if (writer->ok()) {
// add an empty trace at the end of log
*writer << StructuredTrace{
.trace_meta = {.timestamp = UtcClock::secondsSinceEpoch(), .hostname = hostname_}};
}
try {
writer.reset();
} catch (const std::exception &ex) {
XLOGF(ERR,
"Failed to close {} log writer in directory {}, error: {}",
typename_,
config_.trace_file_dir(),
ex.what());
}
if (shutdown)
numWriters_--;
else
writerPool_.enqueue(createNewWriter());
numWritersToClose--;
}
if (numWritersToClose > 0) {
XLOGF(WARN,
"Still have {} {} log writers not closed in directory {}",
numWritersToClose,
typename_,
config_.trace_file_dir());
} else {
XLOGF(INFO, "Flushed {} trace log in directory {}", typename_, config_.trace_file_dir());
}
},
shutdown);
if (!async) asyncFlush_.wait();
dumpingTrace_.clear();
}
// Disables tracing, then flushes synchronously in shutdown mode so that closed writers are
// not replaced.
void close() {
  constexpr bool kAsync = false;
  constexpr bool kShutdown = true;

  enableTraceLog(false);
  flush(kAsync, kShutdown);
  XLOGF(INFO, "Closed {} trace log in directory {}", typename_, config_.trace_file_dir());
}
private:
// Converts `time` into the number of microseconds elapsed since the clock's epoch.
uint64_t microsecondsSinceEpoch(const UtcTime &time) const {
  using std::chrono::duration_cast;
  using std::chrono::microseconds;

  const auto sinceEpoch = time.time_since_epoch();
  return duration_cast<microseconds>(sinceEpoch).count();
}
// Obtains a writer for exclusive use by the caller:
//   1. take an idle writer from the pool if one is available;
//   2. otherwise create a new writer if the writer limit has not been reached;
//   3. otherwise block until another user returns a writer to the pool.
// Returns nullptr when no usable writer could be obtained. In that case the slot that the
// missing writer occupied in numWriters_ is released here, because callers drop a null
// writer instead of returning it to the pool. Without this the count would stay too high
// and later blocking dequeues could wait for a writer that does not exist.
WriterPtr getOrCreateWriter() {
  WriterPtr writer;
  if (!writerPool_.try_dequeue(writer)) {
    auto currentNumWriters = numWriters_.load();
    // Reserve the slot first; only the thread that wins the exchange creates the writer.
    if (currentNumWriters < maxNumWriters_ &&
        numWriters_.compare_exchange_strong(currentNumWriters, currentNumWriters + 1)) {
      writer = createNewWriter();
    } else {
      writer = writerPool_.dequeue();
    }
  }
  if (UNLIKELY(writer == nullptr)) {
    // Either creation failed or the pool held a null entry: give the slot back.
    numWriters_--;
  }
  return writer;
}
// Opens a new log file and returns its writer, or nullptr if the target directory cannot be
// created. The file is placed at
//   <trace_file_dir>/<YYYY-MM-DD>/<hostname>/<type>.<hostname>.<YYYY-MM-DD-HH-MM-SS>.<index>.parquet
// where the date/time parts come from fmt::localtime of the current time and <index> is a
// per-instance counter.
WriterPtr createNewWriter() {
  monitor::ScopedLatencyWriter createLatency(createLatency_);
  auto timestamp = fmt::localtime(UtcClock::to_time_t(UtcClock::now()));
  Path logfilePath =
      config_.trace_file_dir() / Path{fmt::format("{:%Y-%m-%d}", timestamp)} / Path{hostname_} /
      Path{
          fmt::format("{}.{}.{:%Y-%m-%d-%H-%M-%S}.{}.parquet", typename_, hostname_, timestamp, nextLogFileIndex_++)};
  // create_directories() with an error_code succeeds without error when the directory already
  // exists, so no separate existence probe is needed. The previous probe used the throwing
  // overload of exists(), which let filesystem errors escape as exceptions instead of taking
  // the nullptr failure path below.
  const Path logfileDir = logfilePath.parent_path();
  boost::system::error_code err{};
  boost::filesystem::create_directories(logfileDir, err);
  if (UNLIKELY(err.failed())) {
    XLOGF(CRITICAL, "Failed to create directory {}, error: {}", logfileDir, err.message());
    return nullptr;
  }
  XLOGF(INFO, "Opening {} trace log: {}", typename_, logfilePath);
  return WriterType::open(logfilePath, false /*append*/, config_.max_row_group_length());
}
// Sets the enabled flag that append() checks and logs the new state.
void enableTraceLog(bool enable) {
  enabled_ = enable;

  const char *const stateLabel = enable ? "Enabled" : "Disabled";
  XLOGF(INFO, "{} {} trace log in directory {}", stateLabel, typename_, config_.trace_file_dir());
}
void updateMaxNumWriters(size_t newMaxNumWriters) {
XLOGF(INFO,
"Update max num of writers from {} to {} for {} trace log in directory {}",
maxNumWriters_.load(),
newMaxNumWriters,
typename_,
config_.trace_file_dir());
bool doFlush = maxNumWriters_ > newMaxNumWriters;
maxNumWriters_ = newMaxNumWriters;
if (doFlush) flush(false /*async*/);
}
private:
// NOTE: declaration order matters. Members are initialized in this order and
// latencyTagSet_ is built from typename_, while the recorders are built from latencyTagSet_.

// Configuration, held by reference: the Config object must outlive this trace log.
const Config &config_;
// Whether append() records entries. Atomic because it is read by every append() call and
// written by enableTraceLog(), which runs from the config-update callback as well as from
// append()'s failure paths.
std::atomic_bool enabled_{false};
// Short type name of SerdeType; used in log messages, metric tags and log file names.
const std::string typename_;
// Local host name, or "unknown_host" when it cannot be determined.
const std::string hostname_;
// Tags ("tag" = type name, "instance" = address of this object) shared by the recorders below.
const monitor::TagSet latencyTagSet_;
// Latency of createNewWriter().
monitor::LatencyRecorder createLatency_;
// Latency of building and writing one row in append().
monitor::LatencyRecorder appendLatency_;
// Latency of flush() as seen by its caller.
monitor::LatencyRecorder flushLatency_;
// Guard returned by config_.addCallbackGuard() for the config-update callback.
// NOTE(review): presumably unregisters the callback when destroyed - confirm.
std::unique_ptr<ConfigCallbackGuard> onConfigUpdated_;
// Current limit on the number of writers; follows config_.max_num_writers().
std::atomic_size_t maxNumWriters_;
// Number of writer slots currently reserved (incremented before a writer is created).
std::atomic_size_t numWriters_ = 0;
// Running index appended to log file names so that files opened within the same second differ.
std::atomic_size_t nextLogFileIndex_ = 1;
// Idle writers; append() and flush() take a writer out while using it and put it back afterwards.
folly::UnboundedQueue<WriterPtr, false, false, true> writerPool_;
// Time of the next periodic flush, in microseconds since the epoch.
std::atomic_uint64_t nextDumpTime_;
// Set while a flush() call is starting (or synchronously waiting for) a flush task.
std::atomic_flag dumpingTrace_;
// Handle of the most recently started flush task.
std::future<void> asyncFlush_;
};
} // namespace hf3fs::analytics