reformat code with --line-length=150 (#18)

This commit is contained in:
Runji Wang
2025-03-05 22:46:23 +08:00
committed by GitHub
parent ed112db42a
commit 52ecc5e455
48 changed files with 794 additions and 2604 deletions

View File

@@ -80,9 +80,7 @@ def check_data(actual: bytes, expected: bytes, offset: int) -> None:
)
expected = expected[index : index + 16]
actual = actual[index : index + 16]
raise ValueError(
f"Data mismatch at offset {offset + index}.\nexpect: {expected}\nactual: {actual}"
)
raise ValueError(f"Data mismatch at offset {offset + index}.\nexpect: {expected}\nactual: {actual}")
def generate_data(offset: int, length: int) -> bytes:
@@ -92,16 +90,10 @@ def generate_data(offset: int, length: int) -> bytes:
"""
istart = offset // 4
iend = (offset + length + 3) // 4
return (
np.arange(istart, iend)
.astype(np.uint32)
.tobytes()[offset % 4 : offset % 4 + length]
)
return np.arange(istart, iend).astype(np.uint32).tobytes()[offset % 4 : offset % 4 + length]
def iter_io_slice(
offset: int, length: int, block_size: Union[int, Tuple[int, int]]
) -> Iterator[Tuple[int, int]]:
def iter_io_slice(offset: int, length: int, block_size: Union[int, Tuple[int, int]]) -> Iterator[Tuple[int, int]]:
"""
Generate the IO (offset, size) for the slice [offset, offset + length) with the given block size.
`block_size` can be an integer or a range [start, end]. If a range is provided, the IO size will be randomly selected from the range.
@@ -161,9 +153,7 @@ def fstest(
if output_path is not None:
os.makedirs(output_path, exist_ok=True)
df = sp.from_items(
[{"path": os.path.join(output_path, f"{i}")} for i in range(npartitions)]
)
df = sp.from_items([{"path": os.path.join(output_path, f"{i}")} for i in range(npartitions)])
df = df.repartition(npartitions, by_rows=True)
stats = df.map(lambda x: fswrite(x["path"], size, blocksize)).to_pandas()
logging.info(f"write stats:\n{stats}")
@@ -187,18 +177,14 @@ if __name__ == "__main__":
python example/fstest.py -o 'fstest' -j 8 -s 1G -i 'fstest/*'
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"-o", "--output_path", type=str, help="The output path to write data to."
)
parser.add_argument("-o", "--output_path", type=str, help="The output path to write data to.")
parser.add_argument(
"-i",
"--input_path",
type=str,
help="The input path to read data from. If -o is provided, this is ignored.",
)
parser.add_argument(
"-j", "--npartitions", type=int, help="The number of parallel jobs", default=10
)
parser.add_argument("-j", "--npartitions", type=int, help="The number of parallel jobs", default=10)
parser.add_argument(
"-s",
"--size",

View File

@@ -52,9 +52,7 @@ def shuffle_data(
npartitions=num_out_data_partitions,
partition_by_rows=True,
)
shuffled_urls = StreamCopy(
ctx, (repartitioned,), output_name="data_copy", cpu_limit=1
)
shuffled_urls = StreamCopy(ctx, (repartitioned,), output_name="data_copy", cpu_limit=1)
plan = LogicalPlan(ctx, shuffled_urls)
return plan
@@ -66,9 +64,7 @@ def main():
driver.add_argument("-nd", "--num_data_partitions", type=int, default=1024)
driver.add_argument("-nh", "--num_hash_partitions", type=int, default=3840)
driver.add_argument("-no", "--num_out_data_partitions", type=int, default=1920)
driver.add_argument(
"-e", "--engine_type", default="duckdb", choices=("duckdb", "arrow")
)
driver.add_argument("-e", "--engine_type", default="duckdb", choices=("duckdb", "arrow"))
driver.add_argument("-x", "--skip_hash_partition", action="store_true")
plan = shuffle_data(**driver.get_arguments())
driver.run(plan)

View File

@@ -11,9 +11,7 @@ from smallpond.logical.node import (
)
def shuffle_mock_urls(
input_paths, npartitions: int = 10, sort_rand_keys=True, engine_type="duckdb"
) -> LogicalPlan:
def shuffle_mock_urls(input_paths, npartitions: int = 10, sort_rand_keys=True, engine_type="duckdb") -> LogicalPlan:
ctx = Context()
dataset = ParquetDataSet(input_paths)
data_files = DataSourceNode(ctx, dataset)
@@ -61,9 +59,7 @@ def main():
driver.add_argument("-i", "--input_paths", nargs="+")
driver.add_argument("-n", "--npartitions", type=int, default=500)
driver.add_argument("-s", "--sort_rand_keys", action="store_true")
driver.add_argument(
"-e", "--engine_type", default="duckdb", choices=("duckdb", "arrow")
)
driver.add_argument("-e", "--engine_type", default="duckdb", choices=("duckdb", "arrow"))
plan = shuffle_mock_urls(**driver.get_arguments())
driver.run(plan)

View File

@@ -20,9 +20,7 @@ from smallpond.logical.node import (
class SortUrlsNode(ArrowComputeNode):
def process(
self, runtime_ctx: RuntimeContext, input_tables: List[arrow.Table]
) -> arrow.Table:
def process(self, runtime_ctx: RuntimeContext, input_tables: List[arrow.Table]) -> arrow.Table:
logging.info(f"sorting urls by 'host', table shape: {input_tables[0].shape}")
return input_tables[0].sort_by("host")
@@ -90,9 +88,7 @@ def sort_mock_urls(
def main():
driver = Driver()
driver.add_argument(
"-i", "--input_paths", nargs="+", default=["tests/data/mock_urls/*.tsv"]
)
driver.add_argument("-i", "--input_paths", nargs="+", default=["tests/data/mock_urls/*.tsv"])
driver.add_argument("-n", "--npartitions", type=int, default=10)
driver.add_argument("-e", "--engine_type", default="duckdb")

View File

@@ -5,12 +5,8 @@ import smallpond
from smallpond.dataframe import Session
def sort_mock_urls_v2(
sp: Session, input_paths: List[str], output_path: str, npartitions: int
):
dataset = sp.read_csv(
input_paths, schema={"urlstr": "varchar", "valstr": "varchar"}, delim=r"\t"
).repartition(npartitions)
def sort_mock_urls_v2(sp: Session, input_paths: List[str], output_path: str, npartitions: int):
dataset = sp.read_csv(input_paths, schema={"urlstr": "varchar", "valstr": "varchar"}, delim=r"\t").repartition(npartitions)
urls = dataset.map(
"""
split_part(urlstr, '/', 1) as host,
@@ -25,9 +21,7 @@ def sort_mock_urls_v2(
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"-i", "--input_paths", nargs="+", default=["tests/data/mock_urls/*.tsv"]
)
parser.add_argument("-i", "--input_paths", nargs="+", default=["tests/data/mock_urls/*.tsv"])
parser.add_argument("-o", "--output_path", type=str, default="sort_mock_urls")
parser.add_argument("-n", "--npartitions", type=int, default=10)
args = parser.parse_args()