Files
smallpond/api/dataframe.html
2025-03-06 02:23:52 +08:00

763 lines
47 KiB
HTML
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!DOCTYPE html>
<html lang="en" data-content_root="" >
<head>
<meta charset="utf-8" />
<!-- Exactly one viewport declaration; repeating it adds nothing. -->
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>DataFrame &#8212; smallpond documentation</title>
<!--
  Copies the stored "mode" and "theme" values onto <html data-mode data-theme>.
  It sits ahead of the stylesheet links, presumably so the stored theme is in
  place before the first paint (confirm against the theme's CSS).
  Reading localStorage can throw (for example a SecurityError when storage is
  blocked), so the reads are guarded and the defaults "" / "light" still apply.
-->
<script data-cfasync="false">
(function () {
  var mode = "";
  var theme = "light";
  try {
    mode = localStorage.getItem("mode") || "";
    theme = localStorage.getItem("theme") || "light";
  } catch (err) {
    /* Storage unavailable: keep the defaults assigned above. */
  }
  document.documentElement.dataset.mode = mode;
  document.documentElement.dataset.theme = theme;
})();
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../_static/styles/theme.css?digest=5b4479735964841361fd" rel="stylesheet" />
<link href="../_static/styles/bootstrap.css?digest=5b4479735964841361fd" rel="stylesheet" />
<link href="../_static/styles/pydata-sphinx-theme.css?digest=5b4479735964841361fd" rel="stylesheet" />
<link href="../_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=5b4479735964841361fd" rel="stylesheet" />
<!-- Font preloads carry crossorigin, which font fetches require for the preload to be reused. -->
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=8f2a1f02" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=5b4479735964841361fd" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=5b4479735964841361fd" />
<script src="../_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=5b4479735964841361fd"></script>
<!--
  documentation_options.js is loaded synchronously: the inline script below
  assigns DOCUMENTATION_OPTIONS.pagename and therefore needs that object to
  exist already (presumably defined by this file; do not add defer/async here).
-->
<script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js?v=b3ba4146"></script>
<script src="../_static/doctools.js?v=888ff710"></script>
<script src="../_static/sphinx_highlight.js?v=4825356b"></script>
<script>DOCUMENTATION_OPTIONS.pagename = 'api/dataframe';</script>
<!-- Document relations: index, search, and the next/previous pages. -->
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="smallpond.init" href="../generated/smallpond.init.html" />
<link rel="prev" title="API Reference" href="../api.html" />
<meta name="docsearch:language" content="en"/>
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<!-- Skip link: jumps past the navigation chrome to the element with id="main-content". -->
<a class="skip-link" href="#main-content">Skip to main content</a>
<!-- Empty element addressed by id; presumably read by the theme scripts (verify against the theme JS). -->
<div id="pst-scroll-pixel-helper"></div>
<button class="btn rounded-pill" id="pst-back-to-top" type="button">
  <i class="fa-solid fa-arrow-up"></i>
  Back to top
</button>
<!-- Two checkbox + empty overlay label pairs; each label is tied to its checkbox through for=. -->
<input class="sidebar-toggle" id="__primary" name="__primary" type="checkbox"/>
<label class="overlay overlay-primary" for="__primary"></label>
<input class="sidebar-toggle" id="__secondary" name="__secondary" type="checkbox"/>
<label class="overlay overlay-secondary" for="__secondary"></label>
<!-- Search UI: an overlay element plus a GET form that submits the query as "q" to ../search.html. -->
<div class="search-button__wrapper">
  <div class="search-button__overlay"></div>
  <div class="search-button__search-container">
    <form class="bd-search d-flex align-items-center" action="../search.html" method="get">
      <i class="fa-solid fa-magnifying-glass"></i>
      <!-- No visible label: the accessible name comes from aria-label. -->
      <input class="form-control" id="search-input" name="q" type="search"
             placeholder="Search the docs ..." aria-label="Search the docs ..."
             autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
      <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
    </form></div>
</div>
<!--
  Top navigation bar. Contains, in order: the primary sidebar toggle, the
  brand link, the site navigation list, the search and theme-switch buttons
  (written by inline scripts), the icon links, a second copy of the search
  button for the mobile layout, and the secondary sidebar toggle.
-->
<nav class="bd-header navbar navbar-expand-lg bd-navbar">
<div class="bd-header__inner bd-page-width">
<!-- Label bound to the #__primary checkbox through for=. NOTE(review): icon only, no text alternative. -->
<label class="sidebar-toggle primary-toggle" for="__primary">
<span class="fa-solid fa-bars"></span>
</label>
<div class="col-lg-3 navbar-header-items__start">
<div class="navbar-item">
<!-- Brand link back to the documentation home page. -->
<a class="navbar-brand logo" href="../index.html">
<p class="title logo__title">smallpond documentation</p>
</a></div>
</div>
<div class="col-lg-9 navbar-header-items">
<div class="me-auto navbar-header-items__center">
<div class="navbar-item">
<nav class="navbar-nav">
<!-- NOTE(review): exposed as a level-1 heading via role/aria-level; the article also has a real h1. -->
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<!-- Top-level sections; the entry for the section containing this page carries "current active". -->
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../getstarted.html">
Getting Started
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../internals.html">
Internals
</a>
</li>
<li class="nav-item current active">
<a class="nav-link nav-internal" href="../api.html">
API Reference
</a>
</li>
</ul>
</nav></div>
</div>
<div class="navbar-header-items__end">
<div class="navbar-item navbar-persistent--container">
<!-- Search button is emitted with document.write, so it only exists when scripts run. -->
<script>
document.write(`
<button class="btn navbar-btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
`);
</script>
</div>
<div class="navbar-item">
<!-- Theme switch button, also script-written; one span per data-mode value (light, dark, auto). -->
<script>
document.write(`
<button class="btn btn-sm navbar-btn theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch nav-link" data-mode="light"><i class="fa-solid fa-sun fa-lg"></i></span>
<span class="theme-switch nav-link" data-mode="dark"><i class="fa-solid fa-moon fa-lg"></i></span>
<span class="theme-switch nav-link" data-mode="auto"><i class="fa-solid fa-circle-half-stroke fa-lg"></i></span>
</button>
`);
</script></div>
<!-- External icon links; the icon is aria-hidden and the sr-only span supplies the link text. -->
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/deepseek-ai/smallpond" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-square-github fa-lg" aria-hidden="true"></i></span>
<span class="sr-only">GitHub</span></a>
</li>
</ul></div>
</div>
</div>
<!-- Same search button markup as above, written a second time for the mobile container. -->
<div class="navbar-persistent--mobile">
<script>
document.write(`
<button class="btn navbar-btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
`);
</script>
</div>
<!-- Label bound to the #__secondary checkbox. NOTE(review): focusable via tabindex but icon only, with no text alternative. -->
<label class="sidebar-toggle secondary-toggle" for="__secondary" tabindex="0">
<span class="fa-solid fa-outdent"></span>
</label>
</div>
</nav>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar">
<!--
  Sidebar copy of the header items: the same site navigation list, theme
  switch button and icon links that appear in the top navigation bar.
  Presumably shown when the header collapses on narrow screens (confirm in
  the theme CSS).
-->
<div class="sidebar-header-items sidebar-primary__section">
<div class="sidebar-header-items__center">
<div class="navbar-item">
<nav class="navbar-nav">
<!-- NOTE(review): second element exposed as a level-1 heading with the same "Site Navigation" text. -->
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../getstarted.html">
Getting Started
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../internals.html">
Internals
</a>
</li>
<li class="nav-item current active">
<a class="nav-link nav-internal" href="../api.html">
API Reference
</a>
</li>
</ul>
</nav></div>
</div>
<div class="sidebar-header-items__end">
<div class="navbar-item">
<!-- Theme switch button written by script; the template literal below is emitted verbatim. -->
<script>
document.write(`
<button class="btn btn-sm navbar-btn theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch nav-link" data-mode="light"><i class="fa-solid fa-sun fa-lg"></i></span>
<span class="theme-switch nav-link" data-mode="dark"><i class="fa-solid fa-moon fa-lg"></i></span>
<span class="theme-switch nav-link" data-mode="auto"><i class="fa-solid fa-circle-half-stroke fa-lg"></i></span>
</button>
`);
</script></div>
<!-- External icon links; the sr-only span supplies the text for the aria-hidden icon. -->
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/deepseek-ai/smallpond" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-square-github fa-lg" aria-hidden="true"></i></span>
<span class="sr-only">GitHub</span></a>
</li>
</ul></div>
</div>
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item">
<nav class="bd-docs-nav bd-links"
aria-label="Section Navigation">
<p class="bd-links__title" role="heading" aria-level="1">Section Navigation</p>
<div class="bd-toc-item navbar-nav"><ul class="current nav bd-sidenav">
<!--
  Section navigation entry for this page (DataFrame). The link targets "#"
  because it is the page being viewed, so it is marked aria-current="page",
  matching the breadcrumb's current item. The checkbox is pre-checked and its
  label holds the chevron icon; the nested list links to one generated page
  per API member.
-->
<li class="toctree-l1 current active has-children"><a class="current reference internal" href="#" aria-current="page">DataFrame</a><input checked="" class="toctree-checkbox" id="toctree-checkbox-1" name="toctree-checkbox-1" type="checkbox"/><label class="toctree-toggle" for="toctree-checkbox-1"><i class="fa-solid fa-chevron-down"></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.init.html">smallpond.init</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.Session.from_items.html">smallpond.dataframe.Session.from_items</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.Session.from_arrow.html">smallpond.dataframe.Session.from_arrow</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.Session.from_pandas.html">smallpond.dataframe.Session.from_pandas</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.Session.read_csv.html">smallpond.dataframe.Session.read_csv</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.Session.read_json.html">smallpond.dataframe.Session.read_json</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.Session.read_parquet.html">smallpond.dataframe.Session.read_parquet</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.repartition.html">smallpond.dataframe.DataFrame.repartition</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.Session.partial_sql.html">smallpond.dataframe.Session.partial_sql</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.map.html">smallpond.dataframe.DataFrame.map</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.map_batches.html">smallpond.dataframe.DataFrame.map_batches</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.flat_map.html">smallpond.dataframe.DataFrame.flat_map</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.filter.html">smallpond.dataframe.DataFrame.filter</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.limit.html">smallpond.dataframe.DataFrame.limit</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.partial_sort.html">smallpond.dataframe.DataFrame.partial_sort</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.random_shuffle.html">smallpond.dataframe.DataFrame.random_shuffle</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.count.html">smallpond.dataframe.DataFrame.count</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.take.html">smallpond.dataframe.DataFrame.take</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.take_all.html">smallpond.dataframe.DataFrame.take_all</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.to_arrow.html">smallpond.dataframe.DataFrame.to_arrow</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.to_pandas.html">smallpond.dataframe.DataFrame.to_pandas</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.write_parquet.html">smallpond.dataframe.DataFrame.write_parquet</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.write_parquet_lazy.html">smallpond.dataframe.DataFrame.write_parquet_lazy</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.compute.html">smallpond.dataframe.DataFrame.compute</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.is_computed.html">smallpond.dataframe.DataFrame.is_computed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.recompute.html">smallpond.dataframe.DataFrame.recompute</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.dataframe.Session.wait.html">smallpond.dataframe.Session.wait</a></li>
</ul>
</li>
</ul>
<ul class="nav bd-sidenav">
<!--
  Section navigation entry "Dataset": link to dataset.html, an unchecked
  checkbox (toctree-checkbox-2) whose label holds the chevron icon, and a
  nested list of the generated smallpond.logical.dataset.* pages.
  The first line is one unbroken inline run; keep it that way so no
  whitespace is introduced between the link and the toggle.
-->
<li class="toctree-l1 has-children"><a class="reference internal" href="dataset.html">Dataset</a><input class="toctree-checkbox" id="toctree-checkbox-2" name="toctree-checkbox-2" type="checkbox"/><label class="toctree-toggle" for="toctree-checkbox-2"><i class="fa-solid fa-chevron-down"></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.dataset.DataSet.html">smallpond.logical.dataset.DataSet</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.dataset.FileSet.html">smallpond.logical.dataset.FileSet</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.dataset.ParquetDataSet.html">smallpond.logical.dataset.ParquetDataSet</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.dataset.CsvDataSet.html">smallpond.logical.dataset.CsvDataSet</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.dataset.JsonDataSet.html">smallpond.logical.dataset.JsonDataSet</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.dataset.ArrowTableDataSet.html">smallpond.logical.dataset.ArrowTableDataSet</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.dataset.PandasDataSet.html">smallpond.logical.dataset.PandasDataSet</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.dataset.PartitionedDataSet.html">smallpond.logical.dataset.PartitionedDataSet</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.dataset.SqlQueryDataSet.html">smallpond.logical.dataset.SqlQueryDataSet</a></li>
</ul>
</li>
<!--
  Section navigation entry "Nodes": link to nodes.html, an unchecked
  checkbox (toctree-checkbox-3) whose label holds the chevron icon, and a
  nested list of the generated smallpond.logical.node.* pages.
-->
<li class="toctree-l1 has-children"><a class="reference internal" href="nodes.html">Nodes</a><input class="toctree-checkbox" id="toctree-checkbox-3" name="toctree-checkbox-3" type="checkbox"/><label class="toctree-toggle" for="toctree-checkbox-3"><i class="fa-solid fa-chevron-down"></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.Context.html">smallpond.logical.node.Context</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.NodeId.html">smallpond.logical.node.NodeId</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.LogicalPlan.html">smallpond.logical.node.LogicalPlan</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.LogicalPlanVisitor.html">smallpond.logical.node.LogicalPlanVisitor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.Node.html">smallpond.logical.node.Node</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.DataSetPartitionNode.html">smallpond.logical.node.DataSetPartitionNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.ArrowBatchNode.html">smallpond.logical.node.ArrowBatchNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.ArrowComputeNode.html">smallpond.logical.node.ArrowComputeNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.ArrowStreamNode.html">smallpond.logical.node.ArrowStreamNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.ConsolidateNode.html">smallpond.logical.node.ConsolidateNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.DataSinkNode.html">smallpond.logical.node.DataSinkNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.DataSourceNode.html">smallpond.logical.node.DataSourceNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.EvenlyDistributedPartitionNode.html">smallpond.logical.node.EvenlyDistributedPartitionNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.HashPartitionNode.html">smallpond.logical.node.HashPartitionNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.LimitNode.html">smallpond.logical.node.LimitNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.LoadPartitionedDataSetNode.html">smallpond.logical.node.LoadPartitionedDataSetNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.PandasBatchNode.html">smallpond.logical.node.PandasBatchNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.PandasComputeNode.html">smallpond.logical.node.PandasComputeNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.PartitionNode.html">smallpond.logical.node.PartitionNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.ProjectionNode.html">smallpond.logical.node.ProjectionNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.PythonScriptNode.html">smallpond.logical.node.PythonScriptNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.RangePartitionNode.html">smallpond.logical.node.RangePartitionNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.RepeatPartitionNode.html">smallpond.logical.node.RepeatPartitionNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.RootNode.html">smallpond.logical.node.RootNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.ShuffleNode.html">smallpond.logical.node.ShuffleNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.SqlEngineNode.html">smallpond.logical.node.SqlEngineNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.UnionNode.html">smallpond.logical.node.UnionNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.UserDefinedPartitionNode.html">smallpond.logical.node.UserDefinedPartitionNode</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.logical.node.UserPartitionedDataSourceNode.html">smallpond.logical.node.UserPartitionedDataSourceNode</a></li>
</ul>
</li>
<!--
  Section navigation entry "Tasks": link to tasks.html, an unchecked
  checkbox (toctree-checkbox-4) whose label holds the chevron icon, and a
  nested list of the generated smallpond.execution.task.* pages.
-->
<li class="toctree-l1 has-children"><a class="reference internal" href="tasks.html">Tasks</a><input class="toctree-checkbox" id="toctree-checkbox-4" name="toctree-checkbox-4" type="checkbox"/><label class="toctree-toggle" for="toctree-checkbox-4"><i class="fa-solid fa-chevron-down"></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.RuntimeContext.html">smallpond.execution.task.RuntimeContext</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.JobId.html">smallpond.execution.task.JobId</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.TaskId.html">smallpond.execution.task.TaskId</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.TaskRuntimeId.html">smallpond.execution.task.TaskRuntimeId</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.PartitionInfo.html">smallpond.execution.task.PartitionInfo</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.PerfStats.html">smallpond.execution.task.PerfStats</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.ExecutionPlan.html">smallpond.execution.task.ExecutionPlan</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.Task.html">smallpond.execution.task.Task</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.ArrowBatchTask.html">smallpond.execution.task.ArrowBatchTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.ArrowComputeTask.html">smallpond.execution.task.ArrowComputeTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.ArrowStreamTask.html">smallpond.execution.task.ArrowStreamTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.DataSinkTask.html">smallpond.execution.task.DataSinkTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.DataSourceTask.html">smallpond.execution.task.DataSourceTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.EvenlyDistributedPartitionProducerTask.html">smallpond.execution.task.EvenlyDistributedPartitionProducerTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.HashPartitionArrowTask.html">smallpond.execution.task.HashPartitionArrowTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.HashPartitionDuckDbTask.html">smallpond.execution.task.HashPartitionDuckDbTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.HashPartitionTask.html">smallpond.execution.task.HashPartitionTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.LoadPartitionedDataSetProducerTask.html">smallpond.execution.task.LoadPartitionedDataSetProducerTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.MergeDataSetsTask.html">smallpond.execution.task.MergeDataSetsTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.PandasBatchTask.html">smallpond.execution.task.PandasBatchTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.PandasComputeTask.html">smallpond.execution.task.PandasComputeTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.PartitionConsumerTask.html">smallpond.execution.task.PartitionConsumerTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.PartitionProducerTask.html">smallpond.execution.task.PartitionProducerTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.ProjectionTask.html">smallpond.execution.task.ProjectionTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.PythonScriptTask.html">smallpond.execution.task.PythonScriptTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.RangePartitionTask.html">smallpond.execution.task.RangePartitionTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.RepeatPartitionProducerTask.html">smallpond.execution.task.RepeatPartitionProducerTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.RootTask.html">smallpond.execution.task.RootTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.SplitDataSetTask.html">smallpond.execution.task.SplitDataSetTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.SqlEngineTask.html">smallpond.execution.task.SqlEngineTask</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.task.UserDefinedPartitionProducerTask.html">smallpond.execution.task.UserDefinedPartitionProducerTask</a></li>
</ul>
</li>
<!--
  Section navigation entry "Execution": link to execution.html, an unchecked
  checkbox (toctree-checkbox-5) whose label holds the chevron icon, and a
  nested list of generated pages from smallpond.execution.* and
  smallpond.platform.*.
-->
<li class="toctree-l1 has-children"><a class="reference internal" href="execution.html">Execution</a><input class="toctree-checkbox" id="toctree-checkbox-5" name="toctree-checkbox-5" type="checkbox"/><label class="toctree-toggle" for="toctree-checkbox-5"><i class="fa-solid fa-chevron-down"></i></label><ul>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.driver.Driver.html">smallpond.execution.driver.Driver</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.manager.JobManager.html">smallpond.execution.manager.JobManager</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.scheduler.Scheduler.html">smallpond.execution.scheduler.Scheduler</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.execution.executor.Executor.html">smallpond.execution.executor.Executor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.platform.Platform.html">smallpond.platform.Platform</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generated/smallpond.platform.MPI.html">smallpond.platform.MPI</a></li>
</ul>
</li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main">
<div class="bd-content">
<div class="bd-article-container">
<!-- Article header: breadcrumb trail Home > API Reference > DataFrame (current page). -->
<div class="bd-header-article">
  <div class="header-article-items header-article__inner">
    <div class="header-article-items__start">
      <div class="header-article-item">
        <nav aria-label="Breadcrumb">
          <ul class="bd-breadcrumbs">
            <!-- Icon-only home link; aria-label provides its accessible name. -->
            <li class="breadcrumb-item breadcrumb-home">
              <a class="nav-link" href="../index.html" aria-label="Home">
                <i class="fa-solid fa-home"></i>
              </a>
            </li>
            <li class="breadcrumb-item"><a class="nav-link" href="../api.html">API Reference</a></li>
            <li class="breadcrumb-item active" aria-current="page">DataFrame</li>
          </ul>
        </nav>
      </div>
    </div>
  </div>
</div>
<div id="searchbox"></div>
<article class="bd-article" role="main">
<section id="dataframe">
<!-- Page title (the document's h1) with its permalink anchor, followed by the introduction. -->
<span id="id1"></span><h1>DataFrame<a class="headerlink" href="#dataframe" title="Permalink to this heading">#</a></h1>
<p>DataFrame is the main class in smallpond. It represents a lazily computed, partitioned data set.</p>
<p>A typical workflow looks like this:</p>
<!--
  Syntax-highlighted Python example (init, read_parquet, repartition, map,
  write_parquet). Everything inside <pre> is whitespace-significant: do not
  re-indent or re-wrap the lines below.
-->
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">smallpond</span>
<span class="n">sp</span> <span class="o">=</span> <span class="n">smallpond</span><span class="o">.</span><span class="n">init</span><span class="p">()</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">sp</span><span class="o">.</span><span class="n">read_parquet</span><span class="p">(</span><span class="s2">&quot;path/to/dataset/*.parquet&quot;</span><span class="p">)</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="s2">&quot;x + 1&quot;</span><span class="p">)</span>
<span class="n">df</span><span class="o">.</span><span class="n">write_parquet</span><span class="p">(</span><span class="s2">&quot;path/to/output&quot;</span><span class="p">)</span>
</pre></div>
</div>
<section id="initialization">
  <h2>Initialization<a class="headerlink" href="#initialization" title="Permalink to this heading">#</a></h2>
  <!-- Summary table: first cell links to the generated API page, second cell is the one-line description. -->
  <table class="autosummary longtable table autosummary">
    <tbody>
      <tr class="row-odd">
        <td><p><a class="reference internal" href="../generated/smallpond.init.html#smallpond.init" title="smallpond.init"><code class="xref py py-obj docutils literal notranslate"><span class="pre">smallpond.init</span></code></a>([job_id, job_time, job_name, ...])</p></td>
        <td><p>Initialize smallpond environment.</p></td>
      </tr>
    </tbody>
  </table>
</section>
<section id="loading-data">
  <span id="id2"></span><h2>Loading Data<a class="headerlink" href="#loading-data" title="Permalink to this heading">#</a></h2>
  <!-- Summary table of Session constructors: signature link in the first cell, description in the second. -->
  <table class="autosummary longtable table autosummary">
    <tbody>
      <tr class="row-odd">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.Session.from_items.html#smallpond.dataframe.Session.from_items" title="smallpond.dataframe.Session.from_items"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Session.from_items</span></code></a>(items)</p></td>
        <td><p>Create a DataFrame from a list of local Python objects.</p></td>
      </tr>
      <tr class="row-even">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.Session.from_arrow.html#smallpond.dataframe.Session.from_arrow" title="smallpond.dataframe.Session.from_arrow"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Session.from_arrow</span></code></a>(table)</p></td>
        <td><p>Create a DataFrame from a pyarrow Table.</p></td>
      </tr>
      <tr class="row-odd">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.Session.from_pandas.html#smallpond.dataframe.Session.from_pandas" title="smallpond.dataframe.Session.from_pandas"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Session.from_pandas</span></code></a>(df)</p></td>
        <td><p>Create a DataFrame from a pandas DataFrame.</p></td>
      </tr>
      <tr class="row-even">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.Session.read_csv.html#smallpond.dataframe.Session.read_csv" title="smallpond.dataframe.Session.read_csv"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Session.read_csv</span></code></a>(paths, schema[, delim])</p></td>
        <td><p>Create a DataFrame from CSV files.</p></td>
      </tr>
      <tr class="row-odd">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.Session.read_json.html#smallpond.dataframe.Session.read_json" title="smallpond.dataframe.Session.read_json"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Session.read_json</span></code></a>(paths, schema)</p></td>
        <td><p>Create a DataFrame from JSON files.</p></td>
      </tr>
      <tr class="row-even">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.Session.read_parquet.html#smallpond.dataframe.Session.read_parquet" title="smallpond.dataframe.Session.read_parquet"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Session.read_parquet</span></code></a>(paths[, recursive, ...])</p></td>
        <td><p>Create a DataFrame from Parquet files.</p></td>
      </tr>
    </tbody>
  </table>
</section>
<!-- Partitioning Data: single-row summary table linking to the DataFrame.repartition reference page. -->
<section id="partitioning-data">
  <span id="id3"></span><h2>Partitioning Data<a class="headerlink" href="#partitioning-data" title="Permalink to this heading">#</a></h2>
  <table class="autosummary longtable table autosummary">
    <tbody>
      <tr class="row-odd">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.repartition.html#smallpond.dataframe.DataFrame.repartition" title="smallpond.dataframe.DataFrame.repartition"><code class="xref py py-obj docutils literal notranslate"><span class="pre">DataFrame.repartition</span></code></a>(npartitions[, ...])</p></td>
        <td><p>Repartition the data into the given number of partitions.</p></td>
      </tr>
    </tbody>
  </table>
</section>
<!-- Transformations: summary table of the operations that return a new DataFrame; each row links to its generated reference page. -->
<section id="transformations">
  <span id="id4"></span><h2>Transformations<a class="headerlink" href="#transformations" title="Permalink to this heading">#</a></h2>
  <p>Apply transformations and return a new DataFrame.</p>
  <table class="autosummary longtable table autosummary">
    <tbody>
      <tr class="row-odd">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.Session.partial_sql.html#smallpond.dataframe.Session.partial_sql" title="smallpond.dataframe.Session.partial_sql"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Session.partial_sql</span></code></a>(query, *inputs, **kwargs)</p></td>
        <td><p>Execute a SQL query on each partition of the input DataFrames.</p></td>
      </tr>
      <tr class="row-even">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.map.html#smallpond.dataframe.DataFrame.map" title="smallpond.dataframe.DataFrame.map"><code class="xref py py-obj docutils literal notranslate"><span class="pre">DataFrame.map</span></code></a>(sql_or_func, *[, schema])</p></td>
        <td><p>Apply a function to each row.</p></td>
      </tr>
      <tr class="row-odd">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.map_batches.html#smallpond.dataframe.DataFrame.map_batches" title="smallpond.dataframe.DataFrame.map_batches"><code class="xref py py-obj docutils literal notranslate"><span class="pre">DataFrame.map_batches</span></code></a>(func, *[, batch_size])</p></td>
        <td><p>Apply the given function to batches of data.</p></td>
      </tr>
      <tr class="row-even">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.flat_map.html#smallpond.dataframe.DataFrame.flat_map" title="smallpond.dataframe.DataFrame.flat_map"><code class="xref py py-obj docutils literal notranslate"><span class="pre">DataFrame.flat_map</span></code></a>(sql_or_func, *[, schema])</p></td>
        <td><p>Apply a function to each row and flatten the result.</p></td>
      </tr>
      <tr class="row-odd">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.filter.html#smallpond.dataframe.DataFrame.filter" title="smallpond.dataframe.DataFrame.filter"><code class="xref py py-obj docutils literal notranslate"><span class="pre">DataFrame.filter</span></code></a>(sql_or_func, **kwargs)</p></td>
        <td><p>Filter out rows that don't satisfy the given predicate.</p></td>
      </tr>
      <tr class="row-even">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.limit.html#smallpond.dataframe.DataFrame.limit" title="smallpond.dataframe.DataFrame.limit"><code class="xref py py-obj docutils literal notranslate"><span class="pre">DataFrame.limit</span></code></a>(limit)</p></td>
        <td><p>Limit the number of rows to the given number.</p></td>
      </tr>
      <tr class="row-odd">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.partial_sort.html#smallpond.dataframe.DataFrame.partial_sort" title="smallpond.dataframe.DataFrame.partial_sort"><code class="xref py py-obj docutils literal notranslate"><span class="pre">DataFrame.partial_sort</span></code></a>(by, **kwargs)</p></td>
        <td><p>Sort rows by the given columns in each partition.</p></td>
      </tr>
      <tr class="row-even">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.random_shuffle.html#smallpond.dataframe.DataFrame.random_shuffle" title="smallpond.dataframe.DataFrame.random_shuffle"><code class="xref py py-obj docutils literal notranslate"><span class="pre">DataFrame.random_shuffle</span></code></a>(**kwargs)</p></td>
        <td><p>Randomly shuffle all rows globally.</p></td>
      </tr>
    </tbody>
  </table>
</section>
<!-- Consuming Data: summary table of the operations that trigger execution; each row links to its generated reference page. -->
<section id="consuming-data">
  <span id="id5"></span><h2>Consuming Data<a class="headerlink" href="#consuming-data" title="Permalink to this heading">#</a></h2>
  <p>These operations will trigger execution of the lazy transformations performed on this DataFrame.</p>
  <table class="autosummary longtable table autosummary">
    <tbody>
      <tr class="row-odd">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.count.html#smallpond.dataframe.DataFrame.count" title="smallpond.dataframe.DataFrame.count"><code class="xref py py-obj docutils literal notranslate"><span class="pre">DataFrame.count</span></code></a>()</p></td>
        <td><p>Count the number of rows.</p></td>
      </tr>
      <tr class="row-even">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.take.html#smallpond.dataframe.DataFrame.take" title="smallpond.dataframe.DataFrame.take"><code class="xref py py-obj docutils literal notranslate"><span class="pre">DataFrame.take</span></code></a>(limit)</p></td>
        <!-- "limit" is a parameter name, so it is marked up as inline code rather than <cite> (which denotes the title of a work). -->
        <td><p>Return up to <code class="docutils literal notranslate"><span class="pre">limit</span></code> rows.</p></td>
      </tr>
      <tr class="row-odd">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.take_all.html#smallpond.dataframe.DataFrame.take_all" title="smallpond.dataframe.DataFrame.take_all"><code class="xref py py-obj docutils literal notranslate"><span class="pre">DataFrame.take_all</span></code></a>()</p></td>
        <td><p>Return all rows.</p></td>
      </tr>
      <tr class="row-even">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.to_arrow.html#smallpond.dataframe.DataFrame.to_arrow" title="smallpond.dataframe.DataFrame.to_arrow"><code class="xref py py-obj docutils literal notranslate"><span class="pre">DataFrame.to_arrow</span></code></a>()</p></td>
        <td><p>Convert to an arrow Table.</p></td>
      </tr>
      <tr class="row-odd">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.to_pandas.html#smallpond.dataframe.DataFrame.to_pandas" title="smallpond.dataframe.DataFrame.to_pandas"><code class="xref py py-obj docutils literal notranslate"><span class="pre">DataFrame.to_pandas</span></code></a>()</p></td>
        <td><p>Convert to a pandas DataFrame.</p></td>
      </tr>
      <tr class="row-even">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.write_parquet.html#smallpond.dataframe.DataFrame.write_parquet" title="smallpond.dataframe.DataFrame.write_parquet"><code class="xref py py-obj docutils literal notranslate"><span class="pre">DataFrame.write_parquet</span></code></a>(path)</p></td>
        <td><p>Write data to a series of parquet files under the given path.</p></td>
      </tr>
      <tr class="row-odd">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.write_parquet_lazy.html#smallpond.dataframe.DataFrame.write_parquet_lazy" title="smallpond.dataframe.DataFrame.write_parquet_lazy"><code class="xref py py-obj docutils literal notranslate"><span class="pre">DataFrame.write_parquet_lazy</span></code></a>(path)</p></td>
        <td><p>Write data to a series of parquet files under the given path.</p></td>
      </tr>
    </tbody>
  </table>
</section>
<!-- Execution: summary table of the methods for manually triggering or checking computation; each row links to its generated reference page. -->
<section id="execution">
  <h2>Execution<a class="headerlink" href="#execution" title="Permalink to this heading">#</a></h2>
  <p>DataFrames are lazily computed. You can use these methods to manually trigger computation.</p>
  <table class="autosummary longtable table autosummary">
    <tbody>
      <tr class="row-odd">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.compute.html#smallpond.dataframe.DataFrame.compute" title="smallpond.dataframe.DataFrame.compute"><code class="xref py py-obj docutils literal notranslate"><span class="pre">DataFrame.compute</span></code></a>()</p></td>
        <td><p>Compute the data.</p></td>
      </tr>
      <tr class="row-even">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.is_computed.html#smallpond.dataframe.DataFrame.is_computed" title="smallpond.dataframe.DataFrame.is_computed"><code class="xref py py-obj docutils literal notranslate"><span class="pre">DataFrame.is_computed</span></code></a>()</p></td>
        <td><p>Check if the data is ready on disk.</p></td>
      </tr>
      <tr class="row-odd">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.DataFrame.recompute.html#smallpond.dataframe.DataFrame.recompute" title="smallpond.dataframe.DataFrame.recompute"><code class="xref py py-obj docutils literal notranslate"><span class="pre">DataFrame.recompute</span></code></a>()</p></td>
        <td><p>Always recompute the data regardless of whether it's already computed.</p></td>
      </tr>
      <tr class="row-even">
        <td><p><a class="reference internal" href="../generated/smallpond.dataframe.Session.wait.html#smallpond.dataframe.Session.wait" title="smallpond.dataframe.Session.wait"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Session.wait</span></code></a>(*dfs)</p></td>
        <td><p>Wait for all DataFrames to be computed.</p></td>
      </tr>
    </tbody>
  </table>
</section>
</section>
</article>
<!-- Previous/next page links shown below the article body. -->
<footer class="prev-next-footer">
  <div class="prev-next-area">
    <a class="left-prev" href="../api.html" title="previous page">
      <!-- Decorative arrow glyph: hidden from assistive technology because the link text already says "previous". -->
      <i class="fa-solid fa-angle-left" aria-hidden="true"></i>
      <div class="prev-next-info">
        <p class="prev-next-subtitle">previous</p>
        <p class="prev-next-title">API Reference</p>
      </div>
    </a>
    <a class="right-next" href="../generated/smallpond.init.html" title="next page">
      <div class="prev-next-info">
        <p class="prev-next-subtitle">next</p>
        <p class="prev-next-title">smallpond.init</p>
      </div>
      <!-- Decorative arrow glyph: hidden from assistive technology because the link text already says "next". -->
      <i class="fa-solid fa-angle-right" aria-hidden="true"></i>
    </a>
  </div>
</footer>
</div>
<!-- Secondary sidebar: in-page table of contents followed by the "Show Source" link. -->
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
  <div class="sidebar-secondary-item">
    <div class="page-toc tocsection onthispage">
      <!-- Decorative icon; the adjacent text carries the meaning. -->
      <i class="fa-solid fa-list" aria-hidden="true"></i> On this page
    </div>
    <!-- The label gives this navigation landmark an accessible name so it can be told apart from any other <nav> on the page. -->
    <nav class="bd-toc-nav page-toc" aria-label="On this page">
      <ul class="visible nav section-nav flex-column">
        <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#initialization">Initialization</a></li>
        <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#loading-data">Loading Data</a></li>
        <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#partitioning-data">Partitioning Data</a></li>
        <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#transformations">Transformations</a></li>
        <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#consuming-data">Consuming Data</a></li>
        <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#execution">Execution</a></li>
      </ul>
    </nav>
  </div>
  <div class="sidebar-secondary-item">
    <div class="tocsection sourcelink">
      <a href="../_sources/api/dataframe.rst.txt">
        <!-- Decorative icon; the link text "Show Source" carries the meaning. -->
        <i class="fa-solid fa-file-lines" aria-hidden="true"></i> Show Source
      </a>
    </div>
  </div>
</div></div>
</div>
<!-- Footer element with no content on this page (presumably the theme's content-footer slot; confirm against the theme templates). -->
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Theme scripts are placed inside <body>, after the main page content, so that content is parsed before they run; both files are declared as preloads in <head>. -->
<script src="../_static/scripts/bootstrap.js?digest=5b4479735964841361fd"></script>
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=5b4479735964841361fd"></script>
<!-- Site footer: copyright and Sphinx version at the start, theme version at the end. -->
<footer class="bd-footer">
  <div class="bd-footer__inner bd-page-width">
    <div class="footer-items__start">
      <div class="footer-item">
        <!-- The trailing line break was removed: each notice is already its own paragraph, so no spacing hack is needed. -->
        <p class="copyright">
          © Copyright 2025, deepseek.
        </p>
      </div>
      <div class="footer-item">
        <p class="sphinx-version">
          Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 7.1.2.
        </p>
      </div>
    </div>
    <div class="footer-items__end">
      <div class="footer-item">
        <p class="theme-version">
          Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.14.4.
        </p>
      </div>
    </div>
  </div>
</footer>
</body>
</html>