mirror of
https://github.com/deepseek-ai/smallpond
synced 2025-06-26 18:27:45 +00:00
reformat code with --line-length=150 (#18)
This commit is contained in:
@@ -80,9 +80,7 @@ def check_data(actual: bytes, expected: bytes, offset: int) -> None:
|
||||
)
|
||||
expected = expected[index : index + 16]
|
||||
actual = actual[index : index + 16]
|
||||
raise ValueError(
|
||||
f"Data mismatch at offset {offset + index}.\nexpect: {expected}\nactual: {actual}"
|
||||
)
|
||||
raise ValueError(f"Data mismatch at offset {offset + index}.\nexpect: {expected}\nactual: {actual}")
|
||||
|
||||
|
||||
def generate_data(offset: int, length: int) -> bytes:
|
||||
@@ -92,16 +90,10 @@ def generate_data(offset: int, length: int) -> bytes:
|
||||
"""
|
||||
istart = offset // 4
|
||||
iend = (offset + length + 3) // 4
|
||||
return (
|
||||
np.arange(istart, iend)
|
||||
.astype(np.uint32)
|
||||
.tobytes()[offset % 4 : offset % 4 + length]
|
||||
)
|
||||
return np.arange(istart, iend).astype(np.uint32).tobytes()[offset % 4 : offset % 4 + length]
|
||||
|
||||
|
||||
def iter_io_slice(
|
||||
offset: int, length: int, block_size: Union[int, Tuple[int, int]]
|
||||
) -> Iterator[Tuple[int, int]]:
|
||||
def iter_io_slice(offset: int, length: int, block_size: Union[int, Tuple[int, int]]) -> Iterator[Tuple[int, int]]:
|
||||
"""
|
||||
Generate the IO (offset, size) for the slice [offset, offset + length) with the given block size.
|
||||
`block_size` can be an integer or a range [start, end]. If a range is provided, the IO size will be randomly selected from the range.
|
||||
@@ -161,9 +153,7 @@ def fstest(
|
||||
|
||||
if output_path is not None:
|
||||
os.makedirs(output_path, exist_ok=True)
|
||||
df = sp.from_items(
|
||||
[{"path": os.path.join(output_path, f"{i}")} for i in range(npartitions)]
|
||||
)
|
||||
df = sp.from_items([{"path": os.path.join(output_path, f"{i}")} for i in range(npartitions)])
|
||||
df = df.repartition(npartitions, by_rows=True)
|
||||
stats = df.map(lambda x: fswrite(x["path"], size, blocksize)).to_pandas()
|
||||
logging.info(f"write stats:\n{stats}")
|
||||
@@ -187,18 +177,14 @@ if __name__ == "__main__":
|
||||
python example/fstest.py -o 'fstest' -j 8 -s 1G -i 'fstest/*'
|
||||
"""
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"-o", "--output_path", type=str, help="The output path to write data to."
|
||||
)
|
||||
parser.add_argument("-o", "--output_path", type=str, help="The output path to write data to.")
|
||||
parser.add_argument(
|
||||
"-i",
|
||||
"--input_path",
|
||||
type=str,
|
||||
help="The input path to read data from. If -o is provided, this is ignored.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-j", "--npartitions", type=int, help="The number of parallel jobs", default=10
|
||||
)
|
||||
parser.add_argument("-j", "--npartitions", type=int, help="The number of parallel jobs", default=10)
|
||||
parser.add_argument(
|
||||
"-s",
|
||||
"--size",
|
||||
|
||||
@@ -52,9 +52,7 @@ def shuffle_data(
|
||||
npartitions=num_out_data_partitions,
|
||||
partition_by_rows=True,
|
||||
)
|
||||
shuffled_urls = StreamCopy(
|
||||
ctx, (repartitioned,), output_name="data_copy", cpu_limit=1
|
||||
)
|
||||
shuffled_urls = StreamCopy(ctx, (repartitioned,), output_name="data_copy", cpu_limit=1)
|
||||
|
||||
plan = LogicalPlan(ctx, shuffled_urls)
|
||||
return plan
|
||||
@@ -66,9 +64,7 @@ def main():
|
||||
driver.add_argument("-nd", "--num_data_partitions", type=int, default=1024)
|
||||
driver.add_argument("-nh", "--num_hash_partitions", type=int, default=3840)
|
||||
driver.add_argument("-no", "--num_out_data_partitions", type=int, default=1920)
|
||||
driver.add_argument(
|
||||
"-e", "--engine_type", default="duckdb", choices=("duckdb", "arrow")
|
||||
)
|
||||
driver.add_argument("-e", "--engine_type", default="duckdb", choices=("duckdb", "arrow"))
|
||||
driver.add_argument("-x", "--skip_hash_partition", action="store_true")
|
||||
plan = shuffle_data(**driver.get_arguments())
|
||||
driver.run(plan)
|
||||
|
||||
@@ -11,9 +11,7 @@ from smallpond.logical.node import (
|
||||
)
|
||||
|
||||
|
||||
def shuffle_mock_urls(
|
||||
input_paths, npartitions: int = 10, sort_rand_keys=True, engine_type="duckdb"
|
||||
) -> LogicalPlan:
|
||||
def shuffle_mock_urls(input_paths, npartitions: int = 10, sort_rand_keys=True, engine_type="duckdb") -> LogicalPlan:
|
||||
ctx = Context()
|
||||
dataset = ParquetDataSet(input_paths)
|
||||
data_files = DataSourceNode(ctx, dataset)
|
||||
@@ -61,9 +59,7 @@ def main():
|
||||
driver.add_argument("-i", "--input_paths", nargs="+")
|
||||
driver.add_argument("-n", "--npartitions", type=int, default=500)
|
||||
driver.add_argument("-s", "--sort_rand_keys", action="store_true")
|
||||
driver.add_argument(
|
||||
"-e", "--engine_type", default="duckdb", choices=("duckdb", "arrow")
|
||||
)
|
||||
driver.add_argument("-e", "--engine_type", default="duckdb", choices=("duckdb", "arrow"))
|
||||
|
||||
plan = shuffle_mock_urls(**driver.get_arguments())
|
||||
driver.run(plan)
|
||||
|
||||
@@ -20,9 +20,7 @@ from smallpond.logical.node import (
|
||||
|
||||
|
||||
class SortUrlsNode(ArrowComputeNode):
|
||||
def process(
|
||||
self, runtime_ctx: RuntimeContext, input_tables: List[arrow.Table]
|
||||
) -> arrow.Table:
|
||||
def process(self, runtime_ctx: RuntimeContext, input_tables: List[arrow.Table]) -> arrow.Table:
|
||||
logging.info(f"sorting urls by 'host', table shape: {input_tables[0].shape}")
|
||||
return input_tables[0].sort_by("host")
|
||||
|
||||
@@ -90,9 +88,7 @@ def sort_mock_urls(
|
||||
|
||||
def main():
|
||||
driver = Driver()
|
||||
driver.add_argument(
|
||||
"-i", "--input_paths", nargs="+", default=["tests/data/mock_urls/*.tsv"]
|
||||
)
|
||||
driver.add_argument("-i", "--input_paths", nargs="+", default=["tests/data/mock_urls/*.tsv"])
|
||||
driver.add_argument("-n", "--npartitions", type=int, default=10)
|
||||
driver.add_argument("-e", "--engine_type", default="duckdb")
|
||||
|
||||
|
||||
@@ -5,12 +5,8 @@ import smallpond
|
||||
from smallpond.dataframe import Session
|
||||
|
||||
|
||||
def sort_mock_urls_v2(
|
||||
sp: Session, input_paths: List[str], output_path: str, npartitions: int
|
||||
):
|
||||
dataset = sp.read_csv(
|
||||
input_paths, schema={"urlstr": "varchar", "valstr": "varchar"}, delim=r"\t"
|
||||
).repartition(npartitions)
|
||||
def sort_mock_urls_v2(sp: Session, input_paths: List[str], output_path: str, npartitions: int):
|
||||
dataset = sp.read_csv(input_paths, schema={"urlstr": "varchar", "valstr": "varchar"}, delim=r"\t").repartition(npartitions)
|
||||
urls = dataset.map(
|
||||
"""
|
||||
split_part(urlstr, '/', 1) as host,
|
||||
@@ -25,9 +21,7 @@ def sort_mock_urls_v2(
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"-i", "--input_paths", nargs="+", default=["tests/data/mock_urls/*.tsv"]
|
||||
)
|
||||
parser.add_argument("-i", "--input_paths", nargs="+", default=["tests/data/mock_urls/*.tsv"])
|
||||
parser.add_argument("-o", "--output_path", type=str, default="sort_mock_urls")
|
||||
parser.add_argument("-n", "--npartitions", type=int, default=10)
|
||||
args = parser.parse_args()
|
||||
|
||||
Reference in New Issue
Block a user