Export pivoted table into csv

Bug description

Function: pivot_df

Error: The function pivot_df raised a KeyError when trying to pivot the DataFrame due to a missing column.

Log:

/app/superset/charts/post_processing.py, line 314, in apply_post_process
processed_df = post_processor(df, form_data, datasource)
File "/app/superset/charts/post_processing.py", line 233, in pivot_table_v2
return pivot_df(
File "/app/superset/charts/post_processing.py", line 90, in pivot_df
df = df.pivot_table(
File "/usr/local/lib/python3.10/site-packages/pandas/core/frame.py", line 8579, in pivot_table
return pivot_table(
File "/usr/local/lib/python3.10/site-packages/pandas/core/reshape/pivot.py", line 97, in pivot_table
table = __internal_pivot_table(
File "/usr/local/lib/python3.10/site-packages/pandas/core/reshape/pivot.py", line 143, in __internal_pivot_table
raise KeyError(i)
KeyError: 'sum__num'

Steps to reproduce error:

Open a default chart in Superset of the Pivot Table type. In this example, we are using Pivot Table v2 within the USA Births Names dashboard:

Click on Download > Export to Pivoted .CSV
Download is blocked by an error.

Cause: The error is generated by an anomaly in the input DataFrame df, which has the following format (a single column with all distinct fields separated by a semicolon separator):

,state;name;sum__num
0,other;Michael;1047996
1,other;Christopher;803607
2,other;James;749686

Fix: Added a bug fix to split the first column using ";" and expand it into multiple columns, then reassign the original column names.

Code Changes:


###### BUGFIX ##############
# Split the first column using ";" and expand it into multiple columns
# Backup the original column names to reassign them later
columns_backup = df.columns[0].split(";")
df = df.iloc[:,0].str.split(';', expand=True)
df.columns = columns_backup
############################

Complete Code

def pivot_df(  # pylint: disable=too-many-locals, too-many-arguments, too-many-statements, too-many-branches
    df: pd.DataFrame,
    rows: list[str],
    columns: list[str],
    metrics: list[str],
    aggfunc: str = "Sum",
    transpose_pivot: bool = False,
    combine_metrics: bool = False,
    show_rows_total: bool = False,
    show_columns_total: bool = False,
    apply_metrics_on_rows: bool = False,
) -> pd.DataFrame:
    metric_name = __("Total (%(aggfunc)s)", aggfunc=aggfunc)

    if transpose_pivot:
        rows, columns = columns, rows

    # to apply the metrics on the rows we pivot the dataframe, apply the
    # metrics to the columns, and pivot the dataframe back before
    # returning it
    if apply_metrics_on_rows:
        rows, columns = columns, rows
        axis = {"columns": 0, "rows": 1}
    else:
        axis = {"columns": 1, "rows": 0}

    # pivot data; we'll compute totals and subtotals later
    if rows or columns:
        ###### BUGFIX ##############
        # Split the first column using ";" and expand it into multiple columns
        # Backup the original column names to reassign them later
        columns_backup = df.columns[0].split(";")
        df = df.iloc[:,0].str.split(';', expand=True)
        df.columns = columns_backup
        ############################
        # pivoting with null values will create an empty df
        df = df.fillna("NULL")
        df = df.pivot_table(
            index=rows,
            columns=columns,
            values=metrics,
            aggfunc=pivot_v2_aggfunc_map[aggfunc],
            margins=False,
        )
    else:
        # if there's no rows nor columns we have a single value; update
        # the index with the metric name so it shows up in the table
        df.index = pd.Index([*df.index[:-1], metric_name], name="metric")

    # if no rows were passed the metrics will be in the rows, so we
    # need to move them back to columns
    if columns and not rows:
        df = df.stack()
        if not isinstance(df, pd.DataFrame):
            df = df.to_frame()
        df = df.T
        df = df[metrics]
        df.index = pd.Index([*df.index[:-1], metric_name], name="metric")

    # combining metrics changes the column hierarchy, moving the metric
    # from the top to the bottom, eg:
    #
    # ('SUM(col)', 'age', 'name') => ('age', 'name', 'SUM(col)')
    if combine_metrics and isinstance(df.columns, pd.MultiIndex):
        # move metrics to the lowest level
        new_order = [*range(1, df.columns.nlevels), 0]
        df = df.reorder_levels(new_order, axis=1)

        # sort columns, combining metrics for each group
        decorated_columns = [(col, i) for i, col in enumerate(df.columns)]
        grouped_columns = sorted(
            decorated_columns, key=lambda t: get_column_key(t[0], metrics)
        )
        indexes = [i for col, i in grouped_columns]
        df = df[df.columns[indexes]]
    elif rows:
        # if metrics were not combined we sort the dataframe by the list
        # of metrics defined by the user
        df = df[metrics]

    # compute fractions, if needed
    if aggfunc.endswith(" as Fraction of Total"):
        total = df.sum().sum()
        df = df.astype(total.dtypes) / total
    elif aggfunc.endswith(" as Fraction of Columns"):
        total = df.sum(axis=axis["rows"])
        df = df.astype(total.dtypes).div(total, axis=axis["columns"])
    elif aggfunc.endswith(" as Fraction of Rows"):
        total = df.sum(axis=axis["columns"])
        df = df.astype(total.dtypes).div(total, axis=axis["rows"])

    # convert to a MultiIndex to simplify logic
    if not isinstance(df.index, pd.MultiIndex):
        df.index = pd.MultiIndex.from_tuples([(str(i),) for i in df.index])
    if not isinstance(df.columns, pd.MultiIndex):
        df.columns = pd.MultiIndex.from_tuples([(str(i),) for i in df.columns])

    if show_rows_total:
        # add subtotal for each group and overall total; we start from the
        # overall group, and iterate deeper into subgroups
        groups = df.columns
        for level in range(df.columns.nlevels):
            subgroups = {group[:level] for group in groups}
            for subgroup in subgroups:
                slice_ = df.columns.get_loc(subgroup)
                subtotal = pivot_v2_aggfunc_map[aggfunc](df.iloc[:, slice_], axis=1)
                depth = df.columns.nlevels - len(subgroup) - 1
                total = metric_name if level == 0 else __("Subtotal")
                subtotal_name = tuple([*subgroup, total, *([""] * depth)])
                # insert column after subgroup
                df.insert(int(slice_.stop), subtotal_name, subtotal)

    if rows and show_columns_total:
        # add subtotal for each group and overall total; we start from the
        # overall group, and iterate deeper into subgroups
        groups = df.index
        for level in range(df.index.nlevels):
            subgroups = {group[:level] for group in groups}
            for subgroup in subgroups:
                slice_ = df.index.get_loc(subgroup)
                subtotal = pivot_v2_aggfunc_map[aggfunc](
                    df.iloc[slice_, :].apply(pd.to_numeric), axis=0
                )
                depth = df.index.nlevels - len(subgroup) - 1
                total = metric_name if level == 0 else __("Subtotal")
                subtotal.name = tuple([*subgroup, total, *([""] * depth)])
                # insert row after subgroup
                df = pd.concat(
                    [df[: slice_.stop], subtotal.to_frame().T, df[slice_.stop :]]
                )

    # if we want to apply the metrics on the rows we need to pivot the
    # dataframe back
    if apply_metrics_on_rows:
        df = df.T

    return df

Screenshots/recordings

No response

Superset version

4.0.2

Python version

3.10

Node version

Browser

Chrome

Additional context

No response

Checklist

[X] I have searched Superset docs and Slack and didn't find a solution to my problem.
[X] I have searched the GitHub issue tracker and didn't find a similar bug report.
[X] I have checked Superset's logs for errors and if I found a relevant Python stacktrace, I included it here as text in the "additional context" section.

apache / superset