I just did some testing and noticed that something makes Dask take about 10x longer with Panel than without Panel.
The data processing part takes:
- with pandas only about 1.5 seconds
- with pandas + Panel about 3 seconds
- with Dask only about 3 seconds (some overhead cost using Dask is understandable)
- with Dask + Panel about 30 seconds
Reproducible example showing that Dask on its own takes only about 3 seconds to process the data:
- Use the same `testdata.parquet` as in previous posts in this thread.
- Run with `python dasktest.py testdata.parquet`
This prints out
get_data: 0.12 s
create_data_for_graph: 0.03 s
compute: 3.18 s
Code:
# dasktest.py
from __future__ import annotations
import sys
import dask.dataframe as dd
import time
import panel as pn
@pn.cache
def get_data() -> dd.DataFrame:
    """Load the parquet file named by the first CLI argument as a Dask frame.

    Fifteen derived columns, ``extra_col0`` .. ``extra_col14``, are appended;
    column ``extra_col{i}`` holds ``val_A + i``. The function is wrapped in
    ``pn.cache``.
    """
    frame = dd.read_parquet(sys.argv[1])
    for offset in range(15):
        # Item assignment on the frame, spelled as an explicit rebinding.
        frame = frame.assign(**{f"extra_col{offset}": frame["val_A"] + offset})
    return frame
def create_data_for_graph(
    df: dd.DataFrame,
    minute_of_day: tuple[float, float],
    identifiers: list[int],
):
    """Compute the MAE series for label "A" and label "B".

    Each entry of ``minute_of_day`` is rounded with ``round`` first; the
    rounded tuple and ``identifiers`` are then forwarded to ``calculate_mae``
    once per label.

    Returns:
        A ``(mae_a, mae_b)`` tuple: the ``calculate_mae`` result for label
        "A" followed by the one for label "B".
    """
    rounded_window = tuple(round(bound) for bound in minute_of_day)
    # Evaluate "A" before "B", matching the order of the returned tuple.
    results = [
        calculate_mae(
            df=df,
            label=label,
            minute_of_day=rounded_window,
            identifiers=identifiers,
        )
        for label in ("A", "B")
    ]
    return results[0], results[1]
def calculate_mae(
    df: dd.DataFrame,
    minute_of_day: tuple[int, int],
    identifiers: list[int],
    label: str,
) -> dd.Series:
    """Filter ``df`` by the UI selections, then compute the MAE for ``label``.

    Args:
        df: Input frame, passed to ``_filter_based_on_ui_selections``.
        minute_of_day: Bounds forwarded to the filter step.
        identifiers: Identifier values forwarded to the filter step.
        label: Label forwarded to ``do_calculate_mae``.

    Returns:
        Whatever ``do_calculate_mae`` returns for the filtered frame.
    """
    return do_calculate_mae(
        _filter_based_on_ui_selections(
            df,
            minute_of_day=minute_of_day,
            identifiers=identifiers,
        ),
        label=label,
    )
def _filter_based_on_ui_selections(
    df: dd.DataFrame,
    minute_of_day: tuple[int, int],
    identifiers: list[int],
) -> dd.DataFrame:
    """Keep the rows of ``df`` that match the selected window and identifiers.

    Rows are first restricted to those whose ``minute_of_day`` column passes
    ``between(*minute_of_day)``, and the remainder to those whose
    ``identifier`` column is in ``identifiers``.
    """
    in_window = df["minute_of_day"].between(*minute_of_day)
    windowed = df[in_window]
    is_selected = windowed["identifier"].isin(identifiers)
    return windowed[is_selected]
def do_calculate_mae(
    df: dd.DataFrame,
    label: str,
) -> dd.Series:
    """Return the mean absolute error of ``val_{label}`` per ``minute_of_day``.

    The error is ``abs(val_{label} - val_C)``; it is averaged within each
    ``minute_of_day`` group.

    Args:
        df: Frame providing the ``val_{label}``, ``val_C`` and
            ``minute_of_day`` columns. It is not modified.
        label: Suffix selecting the ``val_{label}`` column; also used to name
            the resulting ``err_{label}`` series.

    Returns:
        A series named ``err_{label}`` indexed by ``minute_of_day``.
    """
    col_err = f"err_{label}"
    abs_error = abs(df[f"val_{label}"] - df["val_C"])
    # assign() returns a new frame, so the caller's frame does not silently
    # gain an err_* column (item assignment would mutate it in place).
    with_error = df.assign(**{col_err: abs_error})
    return with_error.groupby("minute_of_day")[col_err].agg("mean")
# Time the three stages separately. time.perf_counter() is used instead of
# time.time() because it is monotonic and meant for measuring short elapsed
# intervals; the wall clock can jump while the script runs.
t0 = time.perf_counter()
df = get_data()
t_get_data = time.perf_counter()
mae_a, mae_b = create_data_for_graph(df, minute_of_day=(0, 1440), identifiers=[21, 79])
t_create_data_for_graph = time.perf_counter()
# NOTE(review): each .compute() call evaluates its own task graph, so work
# shared by both results (reading, filtering) is presumably done twice. They
# are kept separate here so the reported "compute" time stays comparable.
mae_a = mae_a.compute()
mae_b = mae_b.compute()
t_compute = time.perf_counter()
print(f"get_data: {t_get_data - t0:.2f} s")
print(f"create_data_for_graph: {t_create_data_for_graph - t_get_data:.2f} s")
print(f"compute: {t_compute - t_create_data_for_graph:.2f} s")
# Drops into the debugger after the timings are printed (presumably to
# inspect mae_a / mae_b interactively).
breakpoint()
Question
- Why is there so much overhead added when Dask is used with Panel?