feat: post-processing for pivot table v2 (#15879)

* feat: add pivot v2 post-processing * Fix lint
2021-07-29 11:05:56 -07:00 · 2021-07-29 11:05:56 -07:00 · f4739f427e
parent 6afa840659
commit f4739f427e
3 changed files with 504 additions and 1 deletions
--- a/superset/charts/post_processing.py
+++ b/superset/charts/post_processing.py
@ -33,6 +33,13 @@ import pandas as pd
 from superset.utils.core import DTTM_ALIAS, extract_dataframe_dtypes, get_metric_name
 def sql_like_sum(series: pd.Series) -> pd.Series:
    """
    A SUM aggregation function that mimics the behavior from SQL.
    """
    return series.sum(min_count=1)
 def pivot_table(
    result: Dict[Any, Any], form_data: Optional[Dict[str, Any]] = None
 ) -> Dict[Any, Any]:
@ -53,7 +60,7 @@ def pivot_table(
            aggfunc = form_data.get("pandas_aggfunc") or "sum"
            if pd.api.types.is_numeric_dtype(df[metric]):
                if aggfunc == "sum":
-                    aggfunc = lambda x: x.sum(min_count=1)
+                    aggfunc = sql_like_sum
            elif aggfunc not in {"min", "max"}:
                aggfunc = "max"
            aggfuncs[metric] = aggfunc
@ -95,6 +102,120 @@ def pivot_table(
    return result
 def list_unique_values(series: pd.Series) -> str:
    """
    List unique values in a series.
    """
    return ", ".join(set(str(v) for v in pd.Series.unique(series)))
 pivot_v2_aggfunc_map = {
    "Count": pd.Series.count,
    "Count Unique Values": pd.Series.nunique,
    "List Unique Values": list_unique_values,
    "Sum": pd.Series.sum,
    "Average": pd.Series.mean,
    "Median": pd.Series.median,
    "Sample Variance": lambda series: pd.series.var(series) if len(series) > 1 else 0,
    "Sample Standard Deviation": (
        lambda series: pd.series.std(series) if len(series) > 1 else 0,
    ),
    "Minimum": pd.Series.min,
    "Maximum": pd.Series.max,
    "First": lambda series: series[:1],
    "Last": lambda series: series[-1:],
    "Sum as Fraction of Total": pd.Series.sum,
    "Sum as Fraction of Rows": pd.Series.sum,
    "Sum as Fraction of Columns": pd.Series.sum,
    "Count as Fraction of Total": pd.Series.count,
    "Count as Fraction of Rows": pd.Series.count,
    "Count as Fraction of Columns": pd.Series.count,
 }
 def pivot_table_v2(  # pylint: disable=too-many-branches
    result: Dict[Any, Any], form_data: Optional[Dict[str, Any]] = None,
 ) -> Dict[Any, Any]:
    """
    Pivot table v2.
    """
    for query in result["queries"]:
        data = query["data"]
        df = pd.DataFrame(data)
        form_data = form_data or {}
        if form_data.get("granularity_sqla") == "all" and DTTM_ALIAS in df:
            del df[DTTM_ALIAS]
        # TODO (betodealmeida): implement metricsLayout
        metrics = [get_metric_name(m) for m in form_data["metrics"]]
        aggregate_function = form_data.get("aggregateFunction", "Sum")
        groupby = form_data.get("groupbyRows") or []
        columns = form_data.get("groupbyColumns") or []
        if form_data.get("transposePivot"):
            groupby, columns = columns, groupby
        df = df.pivot_table(
            index=groupby,
            columns=columns,
            values=metrics,
            aggfunc=pivot_v2_aggfunc_map[aggregate_function],
            margins=True,
        )
        # The pandas `pivot_table` method either brings both row/column
        # totals, or none at all. We pass `margin=True` to get both, and
        # remove any dimension that was not requests.
        if not form_data.get("rowTotals"):
            df.drop(df.columns[len(df.columns) - 1], axis=1, inplace=True)
        if not form_data.get("colTotals"):
            df = df[:-1]
        # Compute fractions, if needed. If `colTotals` or `rowTotals` are
        # present we need to adjust for including them in the sum
        if aggregate_function.endswith(" as Fraction of Total"):
            total = df.sum().sum()
            df = df.astype(total.dtypes) / total
            if form_data.get("colTotals"):
                df *= 2
            if form_data.get("rowTotals"):
                df *= 2
        elif aggregate_function.endswith(" as Fraction of Columns"):
            total = df.sum(axis=0)
            df = df.astype(total.dtypes).div(total, axis=1)
            if form_data.get("colTotals"):
                df *= 2
        elif aggregate_function.endswith(" as Fraction of Rows"):
            total = df.sum(axis=1)
            df = df.astype(total.dtypes).div(total, axis=0)
            if form_data.get("rowTotals"):
                df *= 2
        # Re-order the columns adhering to the metric ordering.
        df = df[metrics]
        # Display metrics side by side with each column
        if form_data.get("combineMetric"):
            df = df.stack(0).unstack().reindex(level=-1, columns=metrics)
        # flatten column names
        df.columns = [" ".join(column) for column in df.columns]
        # re-arrange data into a list of dicts
        data = []
        for i in df.index:
            row = {col: df[col][i] for col in df.columns}
            row[df.index.name] = i
            data.append(row)
        query["data"] = data
        query["colnames"] = list(df.columns)
        query["coltypes"] = extract_dataframe_dtypes(df)
        query["rowcount"] = len(df.index)
    return result
 post_processors = {
    "pivot_table": pivot_table,
    "pivot_table_v2": pivot_table_v2,
 }
--- a/tests/unit_tests/charts/init.py
+++ b/tests/unit_tests/charts/init.py
@ -0,0 +1,16 @@
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
--- a/tests/unit_tests/charts/test_post_processing.py
+++ b/tests/unit_tests/charts/test_post_processing.py
@ -0,0 +1,366 @@
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 import copy
 from typing import Any, Dict
 from superset.charts.post_processing import pivot_table, pivot_table_v2
 from superset.utils.core import GenericDataType, QueryStatus
 RESULT: Dict[str, Any] = {
    "query_context": None,
    "queries": [
        {
            "cache_key": "1bd3ab8c01e98a0e349fb61bc76d9b90",
            "cached_dttm": None,
            "cache_timeout": 86400,
            "annotation_data": {},
            "error": None,
            "is_cached": None,
            "query": """SELECT state AS state,
       gender AS gender,
       sum(num) AS \"Births\"
 FROM birth_names
 WHERE ds >= TO_TIMESTAMP('1921-07-28 00:00:00.000000', 'YYYY-MM-DD HH24:MI:SS.US')
  AND ds < TO_TIMESTAMP('2021-07-28 10:39:44.000000', 'YYYY-MM-DD HH24:MI:SS.US')
 GROUP BY state,
         gender
 LIMIT 50000;
 """,
            "status": QueryStatus.SUCCESS,
            "stacktrace": None,
            "rowcount": 22,
            "colnames": ["state", "gender", "Births"],
            "coltypes": [
                GenericDataType.STRING,
                GenericDataType.STRING,
                GenericDataType.NUMERIC,
            ],
            "data": [
                {"state": "OH", "gender": "boy", "Births": int("2376385")},
                {"state": "TX", "gender": "girl", "Births": int("2313186")},
                {"state": "MA", "gender": "boy", "Births": int("1285126")},
                {"state": "MA", "gender": "girl", "Births": int("842146")},
                {"state": "PA", "gender": "boy", "Births": int("2390275")},
                {"state": "NY", "gender": "boy", "Births": int("3543961")},
                {"state": "FL", "gender": "boy", "Births": int("1968060")},
                {"state": "TX", "gender": "boy", "Births": int("3311985")},
                {"state": "NJ", "gender": "boy", "Births": int("1486126")},
                {"state": "CA", "gender": "girl", "Births": int("3567754")},
                {"state": "CA", "gender": "boy", "Births": int("5430796")},
                {"state": "IL", "gender": "girl", "Births": int("1614427")},
                {"state": "FL", "gender": "girl", "Births": int("1312593")},
                {"state": "NY", "gender": "girl", "Births": int("2280733")},
                {"state": "NJ", "gender": "girl", "Births": int("992702")},
                {"state": "MI", "gender": "girl", "Births": int("1326229")},
                {"state": "other", "gender": "girl", "Births": int("15058341")},
                {"state": "other", "gender": "boy", "Births": int("22044909")},
                {"state": "MI", "gender": "boy", "Births": int("1938321")},
                {"state": "IL", "gender": "boy", "Births": int("2357411")},
                {"state": "PA", "gender": "girl", "Births": int("1615383")},
                {"state": "OH", "gender": "girl", "Births": int("1622814")},
            ],
            "applied_filters": [],
            "rejected_filters": [],
        }
    ],
 }
 def test_pivot_table():
    form_data = {
        "adhoc_filters": [],
        "columns": ["state"],
        "datasource": "3__table",
        "date_format": "smart_date",
        "extra_form_data": {},
        "granularity_sqla": "ds",
        "groupby": ["gender"],
        "metrics": [
            {
                "aggregate": "SUM",
                "column": {"column_name": "num", "type": "BIGINT"},
                "expressionType": "SIMPLE",
                "label": "Births",
                "optionName": "metric_11",
            }
        ],
        "number_format": "SMART_NUMBER",
        "order_desc": True,
        "pandas_aggfunc": "sum",
        "pivot_margins": True,
        "row_limit": 50000,
        "slice_id": 143,
        "time_grain_sqla": "P1D",
        "time_range": "100 years ago : now",
        "time_range_endpoints": ["inclusive", "exclusive"],
        "url_params": {},
        "viz_type": "pivot_table",
    }
    result = copy.deepcopy(RESULT)
    assert pivot_table(result, form_data) == {
        "query_context": None,
        "queries": [
            {
                "cache_key": "1bd3ab8c01e98a0e349fb61bc76d9b90",
                "cached_dttm": None,
                "cache_timeout": 86400,
                "annotation_data": {},
                "error": None,
                "is_cached": None,
                "query": """SELECT state AS state,
       gender AS gender,
       sum(num) AS \"Births\"
 FROM birth_names
 WHERE ds >= TO_TIMESTAMP('1921-07-28 00:00:00.000000', 'YYYY-MM-DD HH24:MI:SS.US')
  AND ds < TO_TIMESTAMP('2021-07-28 10:39:44.000000', 'YYYY-MM-DD HH24:MI:SS.US')
 GROUP BY state,
         gender
 LIMIT 50000;
 """,
                "status": QueryStatus.SUCCESS,
                "stacktrace": None,
                "rowcount": 3,
                "colnames": [
                    "Births CA",
                    "Births FL",
                    "Births IL",
                    "Births MA",
                    "Births MI",
                    "Births NJ",
                    "Births NY",
                    "Births OH",
                    "Births PA",
                    "Births TX",
                    "Births other",
                    "Births All",
                ],
                "coltypes": [
                    GenericDataType.NUMERIC,
                    GenericDataType.NUMERIC,
                    GenericDataType.NUMERIC,
                    GenericDataType.NUMERIC,
                    GenericDataType.NUMERIC,
                    GenericDataType.NUMERIC,
                    GenericDataType.NUMERIC,
                    GenericDataType.NUMERIC,
                    GenericDataType.NUMERIC,
                    GenericDataType.NUMERIC,
                    GenericDataType.NUMERIC,
                    GenericDataType.NUMERIC,
                ],
                "data": [
                    {
                        "Births CA": 5430796,
                        "Births FL": 1968060,
                        "Births IL": 2357411,
                        "Births MA": 1285126,
                        "Births MI": 1938321,
                        "Births NJ": 1486126,
                        "Births NY": 3543961,
                        "Births OH": 2376385,
                        "Births PA": 2390275,
                        "Births TX": 3311985,
                        "Births other": 22044909,
                        "Births All": 48133355,
                        "gender": "boy",
                    },
                    {
                        "Births CA": 3567754,
                        "Births FL": 1312593,
                        "Births IL": 1614427,
                        "Births MA": 842146,
                        "Births MI": 1326229,
                        "Births NJ": 992702,
                        "Births NY": 2280733,
                        "Births OH": 1622814,
                        "Births PA": 1615383,
                        "Births TX": 2313186,
                        "Births other": 15058341,
                        "Births All": 32546308,
                        "gender": "girl",
                    },
                    {
                        "Births CA": 8998550,
                        "Births FL": 3280653,
                        "Births IL": 3971838,
                        "Births MA": 2127272,
                        "Births MI": 3264550,
                        "Births NJ": 2478828,
                        "Births NY": 5824694,
                        "Births OH": 3999199,
                        "Births PA": 4005658,
                        "Births TX": 5625171,
                        "Births other": 37103250,
                        "Births All": 80679663,
                        "gender": "All",
                    },
                ],
                "applied_filters": [],
                "rejected_filters": [],
            }
        ],
    }
 def test_pivot_table_v2():
    form_data = {
        "adhoc_filters": [],
        "aggregateFunction": "Sum as Fraction of Rows",
        "colOrder": "key_a_to_z",
        "colTotals": True,
        "combineMetric": True,
        "datasource": "3__table",
        "date_format": "smart_date",
        "extra_form_data": {},
        "granularity_sqla": "ds",
        "groupbyColumns": ["state"],
        "groupbyRows": ["gender"],
        "metrics": [
            {
                "aggregate": "SUM",
                "column": {"column_name": "num", "type": "BIGINT"},
                "expressionType": "SIMPLE",
                "label": "Births",
                "optionName": "metric_11",
            }
        ],
        "metricsLayout": "ROWS",
        "rowOrder": "key_a_to_z",
        "rowTotals": True,
        "row_limit": 50000,
        "slice_id": 72,
        "time_grain_sqla": None,
        "time_range": "100 years ago : now",
        "time_range_endpoints": ["inclusive", "exclusive"],
        "transposePivot": True,
        "url_params": {},
        "valueFormat": "SMART_NUMBER",
        "viz_type": "pivot_table_v2",
    }
    result = copy.deepcopy(RESULT)
    assert pivot_table_v2(result, form_data) == {
        "query_context": None,
        "queries": [
            {
                "cache_key": "1bd3ab8c01e98a0e349fb61bc76d9b90",
                "cached_dttm": None,
                "cache_timeout": 86400,
                "annotation_data": {},
                "error": None,
                "is_cached": None,
                "query": """SELECT state AS state,
       gender AS gender,
       sum(num) AS \"Births\"
 FROM birth_names
 WHERE ds >= TO_TIMESTAMP('1921-07-28 00:00:00.000000', 'YYYY-MM-DD HH24:MI:SS.US')
  AND ds < TO_TIMESTAMP('2021-07-28 10:39:44.000000', 'YYYY-MM-DD HH24:MI:SS.US')
 GROUP BY state,
         gender
 LIMIT 50000;
 """,
                "status": QueryStatus.SUCCESS,
                "stacktrace": None,
                "rowcount": 12,
                "colnames": ["All Births", "boy Births", "girl Births"],
                "coltypes": [
                    GenericDataType.NUMERIC,
                    GenericDataType.NUMERIC,
                    GenericDataType.NUMERIC,
                ],
                "data": [
                    {
                        "All Births": 1.0,
                        "boy Births": 0.5965983645717509,
                        "girl Births": 0.40340163542824914,
                        "state": "All",
                    },
                    {
                        "All Births": 1.0,
                        "boy Births": 0.6035190113962805,
                        "girl Births": 0.3964809886037195,
                        "state": "CA",
                    },
                    {
                        "All Births": 1.0,
                        "boy Births": 0.5998988615985903,
                        "girl Births": 0.4001011384014097,
                        "state": "FL",
                    },
                    {
                        "All Births": 1.0,
                        "boy Births": 0.5935315085862012,
                        "girl Births": 0.40646849141379887,
                        "state": "IL",
                    },
                    {
                        "All Births": 1.0,
                        "boy Births": 0.6041192663655611,
                        "girl Births": 0.3958807336344389,
                        "state": "MA",
                    },
                    {
                        "All Births": 1.0,
                        "boy Births": 0.5937482960898133,
                        "girl Births": 0.4062517039101867,
                        "state": "MI",
                    },
                    {
                        "All Births": 1.0,
                        "boy Births": 0.5995276800165239,
                        "girl Births": 0.40047231998347604,
                        "state": "NJ",
                    },
                    {
                        "All Births": 1.0,
                        "boy Births": 0.6084372844307357,
                        "girl Births": 0.39156271556926425,
                        "state": "NY",
                    },
                    {
                        "All Births": 1.0,
                        "boy Births": 0.5942152416021308,
                        "girl Births": 0.40578475839786915,
                        "state": "OH",
                    },
                    {
                        "All Births": 1.0,
                        "boy Births": 0.596724682935987,
                        "girl Births": 0.40327531706401293,
                        "state": "PA",
                    },
                    {
                        "All Births": 1.0,
                        "boy Births": 0.5887794344385264,
                        "girl Births": 0.41122056556147357,
                        "state": "TX",
                    },
                    {
                        "All Births": 1.0,
                        "boy Births": 0.5941503507105172,
                        "girl Births": 0.40584964928948275,
                        "state": "other",
                    },
                ],
                "applied_filters": [],
                "rejected_filters": [],
            }
        ],
    }