deprecate groupby controls in query_obj (#9366)

* Deprecate groupby from query_obj

* Fix query_object bug

* Fix histogram

* Remove groupby from legacy druid connector and fix first batch of unit tests

* Deprecate some unnecessary tests + fix a few others

* Address comments

* Hide SIP-38 changes behind a feature flag

* Break out further SIP-38 related tests

* Resolve test errors

* Add feature flag to QueryContext

* Resolve tests and bad rebase

* Backport recent changes from viz.py and fix broken DeckGL charts

* Fix bad rebase

* Backport #9522 and address comments
Author: Ville Brofeldt, committed via GitHub on 2020-04-14 12:06:10 +03:00
parent c1f8c9e397
commit 9447381549
9 changed files with 4110 additions and 41 deletions
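In short: with the SIP_38_VIZ_REARCHITECTURE feature flag enabled, the query object no longer carries a dedicated groupby field; viz plugins are expected to pass all selectables via columns, and any legacy groupby is folded into columns with a deprecation warning. A minimal sketch of the payload change (the field names and the flag come from this diff; the example values are invented):

# Legacy query object (flag off): dimensions travel in "groupby".
legacy_query_obj = {
    "granularity": "ds",          # invented example values
    "metrics": ["count"],
    "groupby": ["gender", "state"],
    "columns": [],
}

# SIP-38 query object (flag on): all selectables travel in "columns";
# a "groupby" sent by an old viz plugin is appended to "columns" and a
# deprecation warning is logged (see QueryObject.__init__ below).
sip38_query_obj = {
    "granularity": "ds",
    "metrics": ["count"],
    "columns": ["gender", "state"],
}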


@ -16,6 +16,7 @@
# under the License.
# pylint: disable=R
import hashlib
import logging
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Union
@ -23,11 +24,13 @@ import simplejson as json
from flask_babel import gettext as _
from pandas import DataFrame
from superset import app
from superset import app, is_feature_enabled
from superset.exceptions import QueryObjectValidationError
from superset.utils import core as utils, pandas_postprocessing
from superset.views.utils import get_time_range_endpoints
logger = logging.getLogger(__name__)
# TODO: Type Metrics dictionary with TypedDict when it becomes a vanilla python type
# https://github.com/python/mypy/issues/5288
@ -75,6 +78,7 @@ class QueryObject:
relative_start: str = app.config["DEFAULT_RELATIVE_START_TIME"],
relative_end: str = app.config["DEFAULT_RELATIVE_END_TIME"],
):
is_sip_38 = is_feature_enabled("SIP_38_VIZ_REARCHITECTURE")
self.granularity = granularity
self.from_dttm, self.to_dttm = utils.get_since_until(
relative_start=relative_start,
@ -85,8 +89,9 @@ class QueryObject:
self.is_timeseries = is_timeseries
self.time_range = time_range
self.time_shift = utils.parse_human_timedelta(time_shift)
self.groupby = groupby or []
self.post_processing = post_processing or []
if not is_sip_38:
self.groupby = groupby or []
# Temporary solution for backward compatibility issue due to the new format of
# non-ad-hoc metric which needs to adhere to superset-ui per
@ -107,6 +112,13 @@ class QueryObject:
self.extras["time_range_endpoints"] = get_time_range_endpoints(form_data={})
self.columns = columns or []
if is_sip_38 and groupby:
self.columns += groupby
logger.warning(
"The field groupby is deprecated. Viz plugins should "
"pass all selectables via the columns field"
)
self.orderby = orderby or []
def to_dict(self) -> Dict[str, Any]:
@ -115,7 +127,6 @@ class QueryObject:
"from_dttm": self.from_dttm,
"to_dttm": self.to_dttm,
"is_timeseries": self.is_timeseries,
"groupby": self.groupby,
"metrics": self.metrics,
"row_limit": self.row_limit,
"filter": self.filter,
@ -126,6 +137,9 @@ class QueryObject:
"columns": self.columns,
"orderby": self.orderby,
}
if not is_feature_enabled("SIP_38_VIZ_REARCHITECTURE"):
query_object_dict["groupby"] = self.groupby
return query_object_dict
def cache_key(self, **extra: Any) -> str:
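
A rough usage sketch of the new constructor behaviour when the flag is on (QueryObject, its groupby/columns handling and to_dict come from the diff above; the argument values are invented, and a configured Superset environment with SIP_38_VIZ_REARCHITECTURE enabled is assumed):

qobj = QueryObject(
    granularity="ds",       # invented example values
    metrics=["count"],
    groupby=["gender"],     # legacy field: logs the deprecation warning
    columns=["state"],
)
assert qobj.columns == ["state", "gender"]   # groupby folded into columns
assert "groupby" not in qobj.to_dict()       # dropped from the dict under the flag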


@ -287,6 +287,7 @@ DEFAULT_FEATURE_FLAGS = {
"PRESTO_EXPAND_DATA": False,
"REDUCE_DASHBOARD_BOOTSTRAP_PAYLOAD": False,
"SHARE_QUERIES_VIA_KV_STORE": False,
"SIP_38_VIZ_REARCHITECTURE": False,
"TAGGING_SYSTEM": False,
"SQLLAB_BACKEND_PERSISTENCE": False,
"LIST_VIEWS_NEW_UI": False,

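The flag defaults to False, so deployments have to opt in. A minimal sketch, assuming the usual FEATURE_FLAGS override in superset_config.py (the override mechanism itself is not part of this diff):

# superset_config.py
# Entries here are layered on top of DEFAULT_FEATURE_FLAGS at startup.
FEATURE_FLAGS = {
    "SIP_38_VIZ_REARCHITECTURE": True,
}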

@ -48,7 +48,7 @@ from sqlalchemy import (
from sqlalchemy.orm import backref, relationship, Session
from sqlalchemy_utils import EncryptedType
from superset import conf, db, security_manager
from superset import conf, db, is_feature_enabled, security_manager
from superset.connectors.base.models import BaseColumn, BaseDatasource, BaseMetric
from superset.constants import NULL_STRING
from superset.exceptions import SupersetException
@ -84,6 +84,7 @@ try:
except ImportError:
pass
IS_SIP_38 = is_feature_enabled("SIP_38_VIZ_REARCHITECTURE")
DRUID_TZ = conf.get("DRUID_TZ")
POST_AGG_TYPE = "postagg"
metadata = Model.metadata # pylint: disable=no-member
@ -1082,11 +1083,11 @@ class DruidDatasource(Model, BaseDatasource):
return aggregations
def get_dimensions(
self, groupby: List[str], columns_dict: Dict[str, DruidColumn]
self, columns: List[str], columns_dict: Dict[str, DruidColumn]
) -> List[Union[str, Dict]]:
dimensions = []
groupby = [gb for gb in groupby if gb in columns_dict]
for column_name in groupby:
columns = [col for col in columns if col in columns_dict]
for column_name in columns:
col = columns_dict.get(column_name)
dim_spec = col.dimension_spec if col else None
dimensions.append(dim_spec or column_name)
@ -1137,11 +1138,12 @@ class DruidDatasource(Model, BaseDatasource):
def run_query( # druid
self,
groupby,
metrics,
granularity,
from_dttm,
to_dttm,
columns=None,
groupby=None,
filter=None,
is_timeseries=True,
timeseries_limit=None,
@ -1151,7 +1153,6 @@ class DruidDatasource(Model, BaseDatasource):
inner_to_dttm=None,
orderby=None,
extras=None,
columns=None,
phase=2,
client=None,
order_desc=True,
@ -1188,7 +1189,11 @@ class DruidDatasource(Model, BaseDatasource):
)
# the dimensions list with dimensionSpecs expanded
dimensions = self.get_dimensions(groupby, columns_dict)
dimensions = self.get_dimensions(
columns if IS_SIP_38 else groupby, columns_dict
)
extras = extras or {}
qry = dict(
datasource=self.datasource_name,
@ -1214,7 +1219,9 @@ class DruidDatasource(Model, BaseDatasource):
order_direction = "descending" if order_desc else "ascending"
if columns:
if (IS_SIP_38 and not metrics and "__time" not in columns) or (
not IS_SIP_38 and columns
):
columns.append("__time")
del qry["post_aggregations"]
del qry["aggregations"]
@ -1224,11 +1231,20 @@ class DruidDatasource(Model, BaseDatasource):
qry["granularity"] = "all"
qry["limit"] = row_limit
client.scan(**qry)
elif len(groupby) == 0 and not having_filters:
elif (IS_SIP_38 and columns) or (
not IS_SIP_38 and len(groupby) == 0 and not having_filters
):
logger.info("Running timeseries query for no groupby values")
del qry["dimensions"]
client.timeseries(**qry)
elif not having_filters and len(groupby) == 1 and order_desc:
elif (
not having_filters
and order_desc
and (
(IS_SIP_38 and len(columns) == 1)
or (not IS_SIP_38 and len(groupby) == 1)
)
):
dim = list(qry["dimensions"])[0]
logger.info("Running two-phase topn query for dimension [{}]".format(dim))
pre_qry = deepcopy(qry)
@ -1279,7 +1295,10 @@ class DruidDatasource(Model, BaseDatasource):
qry["metric"] = list(qry["aggregations"].keys())[0]
client.topn(**qry)
logger.info("Phase 2 Complete")
elif len(groupby) > 0 or having_filters:
elif (
having_filters
or (IS_SIP_38 and columns)
or (not IS_SIP_38 and len(groupby) > 0)
):
# If grouping on multiple fields or using a having filter
# we have to force a groupby query
logger.info("Running groupby query for dimensions [{}]".format(dimensions))
@ -1364,8 +1383,8 @@ class DruidDatasource(Model, BaseDatasource):
return query_str
@staticmethod
def homogenize_types(df: pd.DataFrame, groupby_cols: Iterable[str]) -> pd.DataFrame:
"""Converting all GROUPBY columns to strings
def homogenize_types(df: pd.DataFrame, columns: Iterable[str]) -> pd.DataFrame:
"""Converting all columns to strings
When grouping by a numeric (say FLOAT) column, pydruid returns
strings in the dataframe. This creates issues downstream related
@ -1374,7 +1393,7 @@ class DruidDatasource(Model, BaseDatasource):
Here we replace None with <NULL> and make the whole series a
str instead of an object.
"""
df[groupby_cols] = df[groupby_cols].fillna(NULL_STRING).astype("unicode")
df[columns] = df[columns].fillna(NULL_STRING).astype("unicode")
return df
def query(self, query_obj: Dict) -> QueryResult:
@ -1390,7 +1409,9 @@ class DruidDatasource(Model, BaseDatasource):
df=df, query=query_str, duration=datetime.now() - qry_start_dttm
)
df = self.homogenize_types(df, query_obj.get("groupby", []))
df = self.homogenize_types(
df, query_obj.get("columns" if IS_SIP_38 else "groupby", [])
)
df.columns = [
DTTM_ALIAS if c in ("timestamp", "__time") else c for c in df.columns
]
@ -1405,7 +1426,9 @@ class DruidDatasource(Model, BaseDatasource):
cols: List[str] = []
if DTTM_ALIAS in df.columns:
cols += [DTTM_ALIAS]
cols += query_obj.get("groupby") or []
if not IS_SIP_38:
cols += query_obj.get("groupby") or []
cols += query_obj.get("columns") or []
cols += query_obj.get("metrics") or []


@ -49,7 +49,7 @@ from sqlalchemy.schema import UniqueConstraint
from sqlalchemy.sql import column, ColumnElement, literal_column, table, text
from sqlalchemy.sql.expression import Label, Select, TextAsFrom
from superset import app, db, security_manager
from superset import app, db, is_feature_enabled, security_manager
from superset.connectors.base.models import BaseColumn, BaseDatasource, BaseMetric
from superset.constants import NULL_STRING
from superset.db_engine_specs.base import TimestampExpression
@ -696,11 +696,12 @@ class SqlaTable(Model, BaseDatasource):
def get_sqla_query( # sqla
self,
groupby,
metrics,
granularity,
from_dttm,
to_dttm,
columns=None,
groupby=None,
filter=None,
is_timeseries=True,
timeseries_limit=15,
@ -710,7 +711,6 @@ class SqlaTable(Model, BaseDatasource):
inner_to_dttm=None,
orderby=None,
extras=None,
columns=None,
order_desc=True,
) -> SqlaQuery:
"""Querying any sqla table from this common interface"""
@ -723,6 +723,7 @@ class SqlaTable(Model, BaseDatasource):
"filter": filter,
"columns": {col.column_name: col for col in self.columns},
}
is_sip_38 = is_feature_enabled("SIP_38_VIZ_REARCHITECTURE")
template_kwargs.update(self.template_params_dict)
extra_cache_keys: List[Any] = []
template_kwargs["extra_cache_keys"] = extra_cache_keys
@ -749,7 +750,11 @@ class SqlaTable(Model, BaseDatasource):
"and is required by this type of chart"
)
)
if not groupby and not metrics and not columns:
if (
not metrics
and not columns
and (is_sip_38 or (not is_sip_38 and not groupby))
):
raise Exception(_("Empty query?"))
metrics_exprs: List[ColumnElement] = []
for m in metrics:
@ -768,9 +773,9 @@ class SqlaTable(Model, BaseDatasource):
select_exprs: List[Column] = []
groupby_exprs_sans_timestamp: OrderedDict = OrderedDict()
if groupby:
if (is_sip_38 and metrics and columns) or (not is_sip_38 and groupby):
# dedup columns while preserving order
groupby = list(dict.fromkeys(groupby))
groupby = list(dict.fromkeys(columns if is_sip_38 else groupby))
select_exprs = []
for s in groupby:
@ -829,7 +834,7 @@ class SqlaTable(Model, BaseDatasource):
tbl = self.get_from_clause(template_processor)
if not columns:
if (is_sip_38 and metrics) or (not is_sip_38 and not columns):
qry = qry.group_by(*groupby_exprs_with_timestamp.values())
where_clause_and = []
@ -892,7 +897,7 @@ class SqlaTable(Model, BaseDatasource):
qry = qry.where(and_(*where_clause_and))
qry = qry.having(and_(*having_clause_and))
if not orderby and not columns:
if not orderby and ((is_sip_38 and metrics) or (not is_sip_38 and not columns)):
orderby = [(main_metric_expr, not order_desc)]
# To ensure correct handling of the ORDER BY labeling we need to reference the
@ -914,7 +919,12 @@ class SqlaTable(Model, BaseDatasource):
if row_limit:
qry = qry.limit(row_limit)
if is_timeseries and timeseries_limit and groupby and not time_groupby_inline:
if (
is_timeseries
and timeseries_limit
and not time_groupby_inline
and ((is_sip_38 and columns) or (not is_sip_38 and groupby))
):
if self.database.db_engine_spec.allows_joins:
# some sql dialects require for order by expressions
# to also be in the select clause -- others, e.g. vertica,
@ -972,7 +982,6 @@ class SqlaTable(Model, BaseDatasource):
prequery_obj = {
"is_timeseries": False,
"row_limit": timeseries_limit,
"groupby": groupby,
"metrics": metrics,
"granularity": granularity,
"from_dttm": inner_from_dttm or from_dttm,
@ -983,6 +992,9 @@ class SqlaTable(Model, BaseDatasource):
"columns": columns,
"order_desc": True,
}
if not is_sip_38:
prequery_obj["groupby"] = groupby
result = self.query(prequery_obj)
prequeries.append(result.query)
dimensions = [


@ -31,7 +31,11 @@ from superset.legacy import update_time_range
from superset.models.helpers import AuditMixinNullable, ImportMixin
from superset.models.tags import ChartUpdater
from superset.utils import core as utils
from superset.viz import BaseViz, viz_types
if is_feature_enabled("SIP_38_VIZ_REARCHITECTURE"):
from superset.viz_sip38 import BaseViz, viz_types # type: ignore
else:
from superset.viz import BaseViz, viz_types # type: ignore
if TYPE_CHECKING:
# pylint: disable=unused-import


@ -23,7 +23,7 @@ import simplejson as json
from flask import request
import superset.models.core as models
from superset import app, db, viz
from superset import app, db, is_feature_enabled
from superset.connectors.connector_registry import ConnectorRegistry
from superset.exceptions import SupersetException
from superset.legacy import update_time_range
@ -31,6 +31,12 @@ from superset.models.dashboard import Dashboard
from superset.models.slice import Slice
from superset.utils.core import QueryStatus, TimeRangeEndpoint
if is_feature_enabled("SIP_38_VIZ_REARCHITECTURE"):
from superset import viz_sip38 as viz # type: ignore
else:
from superset import viz # type: ignore
FORM_DATA_KEY_BLACKLIST: List[str] = []
if not app.config["ENABLE_JAVASCRIPT_CONTROLS"]:
FORM_DATA_KEY_BLACKLIST = ["js_tooltip", "js_onclick_href", "js_data_mutator"]

superset/viz_sip38.py (new file, 2852 additions)

File diff suppressed because it is too large.


@ -415,11 +415,11 @@ class DruidFuncTestCase(SupersetTestCase):
client.query_builder.last_query.query_dict = {"mock": 0}
# no groupby calls client.timeseries
ds.run_query(
groupby,
metrics,
None,
from_dttm,
to_dttm,
groupby=groupby,
client=client,
filter=[],
row_limit=100,
@ -472,11 +472,11 @@ class DruidFuncTestCase(SupersetTestCase):
client.query_builder.last_query.query_dict = {"mock": 0}
# no groupby calls client.timeseries
ds.run_query(
groupby,
metrics,
None,
from_dttm,
to_dttm,
groupby=groupby,
client=client,
filter=[],
row_limit=100,
@ -519,11 +519,11 @@ class DruidFuncTestCase(SupersetTestCase):
client.query_builder.last_query.query_dict = {"mock": 0}
# client.topn is called twice
ds.run_query(
groupby,
metrics,
None,
from_dttm,
to_dttm,
groupby=groupby,
timeseries_limit=100,
client=client,
order_desc=True,
@ -543,11 +543,11 @@ class DruidFuncTestCase(SupersetTestCase):
client = Mock()
client.query_builder.last_query.query_dict = {"mock": 0}
ds.run_query(
groupby,
metrics,
None,
from_dttm,
to_dttm,
groupby=groupby,
client=client,
order_desc=False,
filter=[],
@ -568,11 +568,11 @@ class DruidFuncTestCase(SupersetTestCase):
client = Mock()
client.query_builder.last_query.query_dict = {"mock": 0}
ds.run_query(
groupby,
metrics,
None,
from_dttm,
to_dttm,
groupby=groupby,
client=client,
order_desc=True,
timeseries_limit=5,
@ -619,11 +619,11 @@ class DruidFuncTestCase(SupersetTestCase):
client.query_builder.last_query.query_dict = {"mock": 0}
# no groupby calls client.timeseries
ds.run_query(
groupby,
metrics,
None,
from_dttm,
to_dttm,
groupby=groupby,
client=client,
row_limit=100,
filter=[],
@ -1021,11 +1021,11 @@ class DruidFuncTestCase(SupersetTestCase):
granularity = "all"
# get the counts of the top 5 'dim1's, order by 'sum1'
ds.run_query(
groupby,
metrics,
granularity,
from_dttm,
to_dttm,
groupby=groupby,
timeseries_limit=5,
timeseries_limit_metric="sum1",
client=client,
@ -1042,11 +1042,11 @@ class DruidFuncTestCase(SupersetTestCase):
# get the counts of the top 5 'dim1's, order by 'div1'
ds.run_query(
groupby,
metrics,
granularity,
from_dttm,
to_dttm,
groupby=groupby,
timeseries_limit=5,
timeseries_limit_metric="div1",
client=client,
@ -1064,11 +1064,11 @@ class DruidFuncTestCase(SupersetTestCase):
groupby = ["dim1", "dim2"]
# get the counts of the top 5 ['dim1', 'dim2']s, order by 'sum1'
ds.run_query(
groupby,
metrics,
granularity,
from_dttm,
to_dttm,
groupby=groupby,
timeseries_limit=5,
timeseries_limit_metric="sum1",
client=client,
@ -1085,11 +1085,11 @@ class DruidFuncTestCase(SupersetTestCase):
# get the counts of the top 5 ['dim1', 'dim2']s, order by 'div1'
ds.run_query(
groupby,
metrics,
granularity,
from_dttm,
to_dttm,
groupby=groupby,
timeseries_limit=5,
timeseries_limit_metric="div1",
client=client,

File diff suppressed because it is too large.