Why is Panel 10x slower with Dask than with pandas?

I just did some testing and noticed that something makes Dask take about 10x longer with Panel than without Panel.

The data processing part takes:

  • with pandas only about 1.5 seconds
  • with pandas + Panel about 3 seconds
  • with Dask only about 3 seconds (some overhead cost using Dask is understandable)
  • with Dask + Panel about 30 seconds
Reproducible example showing that Dask alone takes only about 3 seconds to process the data:
  • Use the same testdata.parquet as in previous posts in this thread.
  • Run with python dasktest.py testdata.parquet

This prints out

get_data: 0.12 s
create_data_for_graph: 0.03 s
compute: 3.18 s

Code:

# dasktest.py
from __future__ import annotations

import sys
import dask.dataframe as dd
import time

import panel as pn


@pn.cache
def get_data() -> dd.DataFrame:
    """Load the parquet file named on the command line and widen it.

    The path is taken from the first command-line argument.  Fifteen
    derived columns ``extra_col0`` .. ``extra_col14`` are then added, each
    being ``val_A`` plus its own index.  The function is wrapped in
    ``pn.cache``.
    """
    n_extra_columns = 15
    frame = dd.read_parquet(sys.argv[1])
    for offset in range(n_extra_columns):
        column_name = f"extra_col{offset}"
        frame[column_name] = frame["val_A"] + offset
    return frame


def create_data_for_graph(
    df: dd.DataFrame,
    minute_of_day: tuple[float, float],
    identifiers: list[int],
):
    """Build the MAE series for labels ``"A"`` and ``"B"``.

    Each ``minute_of_day`` bound is passed through ``round`` first; after
    that ``calculate_mae`` is called once per label with the same frame and
    the same selections.

    Returns:
        The pair ``(mae_a, mae_b)``.
    """
    rounded_bounds = tuple(round(bound) for bound in minute_of_day)

    # Everything except the label is identical for both calls.
    shared_kwargs = dict(
        df=df,
        minute_of_day=rounded_bounds,
        identifiers=identifiers,
    )
    mae_a = calculate_mae(label="A", **shared_kwargs)
    mae_b = calculate_mae(label="B", **shared_kwargs)

    return mae_a, mae_b


def calculate_mae(
    df: dd.DataFrame,
    minute_of_day: tuple[int, int],
    identifiers: list[int],
    label: str,
) -> dd.Series:
    """Filter ``df`` by the UI selections, then compute the MAE for ``label``.

    The frame is first narrowed with ``_filter_based_on_ui_selections`` and
    the result is handed to ``do_calculate_mae``.
    """
    return do_calculate_mae(
        _filter_based_on_ui_selections(
            df,
            minute_of_day=minute_of_day,
            identifiers=identifiers,
        ),
        label=label,
    )


def _filter_based_on_ui_selections(
    df: dd.DataFrame,
    minute_of_day: tuple[int, int],
    identifiers: list[int],
) -> dd.DataFrame:
    """Keep only the rows matching the selected time range and identifiers.

    Rows are first restricted to those whose ``minute_of_day`` value is
    accepted by ``between(*minute_of_day)``; the remaining rows are then
    restricted to those whose ``identifier`` is one of ``identifiers``.
    """
    in_time_window = df["minute_of_day"].between(*minute_of_day)
    windowed = df[in_time_window]

    has_wanted_identifier = windowed["identifier"].isin(identifiers)
    return windowed[has_wanted_identifier]


def do_calculate_mae(
    df: dd.DataFrame,
    label: str,
) -> dd.Series:
    """Return the mean absolute error of ``val_<label>`` per ``minute_of_day``.

    The absolute difference between the ``val_<label>`` column and the
    ``val_C`` column is computed row by row, then averaged within each
    ``minute_of_day`` group.

    The input frame is not modified: the error column is attached to a new
    frame via ``assign`` instead of being written into ``df``, so callers
    can safely reuse (or cache) the frame they pass in.

    Args:
        df: Frame with ``minute_of_day``, ``val_C`` and ``val_<label>``
            columns.
        label: Suffix selecting the value column to compare against
            ``val_C`` (for example ``"A"`` selects ``val_A``).

    Returns:
        A series named ``err_<label>``, indexed by ``minute_of_day``.
    """
    col_err = f"err_{label}"
    abs_error = abs(df[f"val_{label}"] - df["val_C"])

    # ``assign`` returns a new frame, leaving the caller's ``df`` untouched.
    with_error = df.assign(**{col_err: abs_error})
    mae = with_error.groupby("minute_of_day")[col_err].agg("mean")

    return mae


# Time the three phases separately: loading the frame, building the result
# collections, and the two ``compute()`` calls.
# ``time.perf_counter`` is used because it is a monotonic, high-resolution
# clock meant for measuring elapsed intervals; ``time.time`` follows the wall
# clock, which may be adjusted while the script runs.
t0 = time.perf_counter()
df = get_data()
t_get_data = time.perf_counter()
mae_a, mae_b = create_data_for_graph(df, minute_of_day=(0, 1440), identifiers=[21, 79])
t_create_data_for_graph = time.perf_counter()
mae_a = mae_a.compute()
mae_b = mae_b.compute()
t_compute = time.perf_counter()

print(f"get_data: {t_get_data - t0:.2f} s")
print(f"create_data_for_graph: {t_create_data_for_graph - t_get_data:.2f} s")
print(f"compute: {t_compute - t_create_data_for_graph:.2f} s")
# Drop into the debugger so the computed results can be inspected interactively.
breakpoint()

Question

  • Why is there so much overhead when Dask is used with Panel?