[missing values] Removing replacing missing values (#4905)
This commit is contained in:
parent
30f88caf55
commit
61add606ca
|
|
@ -3729,4 +3729,5 @@
|
|||
"default": false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1351,7 +1351,7 @@ export const controls = {
|
|||
'mean',
|
||||
'min',
|
||||
'max',
|
||||
'stdev',
|
||||
'std',
|
||||
'var',
|
||||
]),
|
||||
default: 'sum',
|
||||
|
|
|
|||
|
|
@ -41,7 +41,6 @@ class QueryContext:
|
|||
to retrieve the data payload for a given viz.
|
||||
"""
|
||||
|
||||
default_fillna = 0
|
||||
cache_type = 'df'
|
||||
enforce_numerical_metrics = True
|
||||
|
||||
|
|
@ -103,7 +102,6 @@ class QueryContext:
|
|||
self.df_metrics_to_num(df, query_object)
|
||||
|
||||
df.replace([np.inf, -np.inf], np.nan)
|
||||
df = self.handle_nulls(df)
|
||||
return {
|
||||
'query': result.query,
|
||||
'status': result.status,
|
||||
|
|
@ -118,27 +116,6 @@ class QueryContext:
|
|||
if dtype.type == np.object_ and col in metrics:
|
||||
df[col] = pd.to_numeric(df[col], errors='coerce')
|
||||
|
||||
def handle_nulls(self, df):
|
||||
fillna = self.get_fillna_for_columns(df.columns)
|
||||
return df.fillna(fillna)
|
||||
|
||||
def get_fillna_for_col(self, col):
|
||||
"""Returns the value to use as filler for a specific Column.type"""
|
||||
if col and col.is_string:
|
||||
return ' NULL'
|
||||
return self.default_fillna
|
||||
|
||||
def get_fillna_for_columns(self, columns=None):
|
||||
"""Returns a dict or scalar that can be passed to DataFrame.fillna"""
|
||||
if columns is None:
|
||||
return self.default_fillna
|
||||
columns_dict = {col.column_name: col for col in self.datasource.columns}
|
||||
fillna = {
|
||||
c: self.get_fillna_for_col(columns_dict.get(c))
|
||||
for c in columns
|
||||
}
|
||||
return fillna
|
||||
|
||||
def get_data(self, df):
|
||||
return df.to_dict(orient='records')
|
||||
|
||||
|
|
|
|||
102
superset/viz.py
102
superset/viz.py
|
|
@ -75,7 +75,6 @@ class BaseViz(object):
|
|||
verbose_name = 'Base Viz'
|
||||
credits = ''
|
||||
is_timeseries = False
|
||||
default_fillna = 0
|
||||
cache_type = 'df'
|
||||
enforce_numerical_metrics = True
|
||||
|
||||
|
|
@ -164,28 +163,6 @@ class BaseViz(object):
|
|||
"""
|
||||
pass
|
||||
|
||||
def handle_nulls(self, df):
|
||||
fillna = self.get_fillna_for_columns(df.columns)
|
||||
return df.fillna(fillna)
|
||||
|
||||
def get_fillna_for_col(self, col):
|
||||
"""Returns the value to use as filler for a specific Column.type"""
|
||||
if col:
|
||||
if col.is_string:
|
||||
return ' NULL'
|
||||
return self.default_fillna
|
||||
|
||||
def get_fillna_for_columns(self, columns=None):
|
||||
"""Returns a dict or scalar that can be passed to DataFrame.fillna"""
|
||||
if columns is None:
|
||||
return self.default_fillna
|
||||
columns_dict = {col.column_name: col for col in self.datasource.columns}
|
||||
fillna = {
|
||||
c: self.get_fillna_for_col(columns_dict.get(c))
|
||||
for c in columns
|
||||
}
|
||||
return fillna
|
||||
|
||||
def get_samples(self):
|
||||
query_obj = self.query_obj()
|
||||
query_obj.update({
|
||||
|
|
@ -254,8 +231,7 @@ class BaseViz(object):
|
|||
if self.enforce_numerical_metrics:
|
||||
self.df_metrics_to_num(df)
|
||||
|
||||
df.replace([np.inf, -np.inf], np.nan)
|
||||
df = self.handle_nulls(df)
|
||||
df.replace([np.inf, -np.inf], np.nan, inplace=True)
|
||||
return df
|
||||
|
||||
def df_metrics_to_num(self, df):
|
||||
|
|
@ -653,7 +629,9 @@ class TimeTableViz(BaseViz):
|
|||
pt = df.pivot_table(
|
||||
index=DTTM_ALIAS,
|
||||
columns=columns,
|
||||
values=values)
|
||||
values=values,
|
||||
dropna=False,
|
||||
)
|
||||
pt.index = pt.index.map(str)
|
||||
pt = pt.sort_index()
|
||||
return dict(
|
||||
|
|
@ -696,12 +674,20 @@ class PivotTableViz(BaseViz):
|
|||
self.form_data.get('granularity') == 'all' and
|
||||
DTTM_ALIAS in df):
|
||||
del df[DTTM_ALIAS]
|
||||
|
||||
aggfunc = self.form_data.get('pandas_aggfunc')
|
||||
|
||||
# Ensure that Pandas's sum function mimics that of SQL.
|
||||
if aggfunc == 'sum':
|
||||
aggfunc = lambda x: x.sum(min_count=1) # noqa: E731
|
||||
|
||||
df = df.pivot_table(
|
||||
index=self.form_data.get('groupby'),
|
||||
columns=self.form_data.get('columns'),
|
||||
values=[utils.get_metric_name(m) for m in self.form_data.get('metrics')],
|
||||
aggfunc=self.form_data.get('pandas_aggfunc'),
|
||||
aggfunc=aggfunc,
|
||||
margins=self.form_data.get('pivot_margins'),
|
||||
dropna=False,
|
||||
)
|
||||
# Display metrics side by side with each column
|
||||
if self.form_data.get('combine_metric'):
|
||||
|
|
@ -709,7 +695,7 @@ class PivotTableViz(BaseViz):
|
|||
return dict(
|
||||
columns=list(df.columns),
|
||||
html=df.to_html(
|
||||
na_rep='',
|
||||
na_rep='null',
|
||||
classes=(
|
||||
'dataframe table table-striped table-bordered '
|
||||
'table-condensed table-hover').split(' ')),
|
||||
|
|
@ -877,7 +863,7 @@ class BoxPlotViz(NVD3Viz):
|
|||
index_value = label_sep.join(index_value)
|
||||
boxes = defaultdict(dict)
|
||||
for (label, key), value in row.items():
|
||||
if key == 'median':
|
||||
if key == 'nanmedian':
|
||||
key = 'Q2'
|
||||
boxes[label][key] = value
|
||||
for label, box in boxes.items():
|
||||
|
|
@ -894,28 +880,24 @@ class BoxPlotViz(NVD3Viz):
|
|||
|
||||
def get_data(self, df):
|
||||
form_data = self.form_data
|
||||
df = df.fillna(0)
|
||||
|
||||
# conform to NVD3 names
|
||||
def Q1(series): # need to be named functions - can't use lambdas
|
||||
return np.percentile(series, 25)
|
||||
return np.nanpercentile(series, 25)
|
||||
|
||||
def Q3(series):
|
||||
return np.percentile(series, 75)
|
||||
return np.nanpercentile(series, 75)
|
||||
|
||||
whisker_type = form_data.get('whisker_options')
|
||||
if whisker_type == 'Tukey':
|
||||
|
||||
def whisker_high(series):
|
||||
upper_outer_lim = Q3(series) + 1.5 * (Q3(series) - Q1(series))
|
||||
series = series[series <= upper_outer_lim]
|
||||
return series[np.abs(series - upper_outer_lim).argmin()]
|
||||
return series[series <= upper_outer_lim].max()
|
||||
|
||||
def whisker_low(series):
|
||||
lower_outer_lim = Q1(series) - 1.5 * (Q3(series) - Q1(series))
|
||||
# find the closest value above the lower outer limit
|
||||
series = series[series >= lower_outer_lim]
|
||||
return series[np.abs(series - lower_outer_lim).argmin()]
|
||||
return series[series >= lower_outer_lim].min()
|
||||
|
||||
elif whisker_type == 'Min/max (no outliers)':
|
||||
|
||||
|
|
@ -929,10 +911,10 @@ class BoxPlotViz(NVD3Viz):
|
|||
low, high = whisker_type.replace(' percentiles', '').split('/')
|
||||
|
||||
def whisker_high(series):
|
||||
return np.percentile(series, int(high))
|
||||
return np.nanpercentile(series, int(high))
|
||||
|
||||
def whisker_low(series):
|
||||
return np.percentile(series, int(low))
|
||||
return np.nanpercentile(series, int(low))
|
||||
|
||||
else:
|
||||
raise ValueError('Unknown whisker type: {}'.format(whisker_type))
|
||||
|
|
@ -943,7 +925,7 @@ class BoxPlotViz(NVD3Viz):
|
|||
# pandas sometimes doesn't like getting lists back here
|
||||
return set(above.tolist() + below.tolist())
|
||||
|
||||
aggregate = [Q1, np.median, Q3, whisker_high, whisker_low, outliers]
|
||||
aggregate = [Q1, np.nanmedian, Q3, whisker_high, whisker_low, outliers]
|
||||
df = df.groupby(form_data.get('groupby')).agg(aggregate)
|
||||
chart_data = self.to_series(df)
|
||||
return chart_data
|
||||
|
|
@ -1034,7 +1016,6 @@ class BulletViz(NVD3Viz):
|
|||
return d
|
||||
|
||||
def get_data(self, df):
|
||||
df = df.fillna(0)
|
||||
df['metric'] = df[[utils.get_metric_name(self.metric)]]
|
||||
values = df['metric'].values
|
||||
return {
|
||||
|
|
@ -1152,7 +1133,6 @@ class NVD3TimeSeriesViz(NVD3Viz):
|
|||
|
||||
def process_data(self, df, aggregate=False):
|
||||
fd = self.form_data
|
||||
df = df.fillna(0)
|
||||
if fd.get('granularity') == 'all':
|
||||
raise Exception(_('Pick a time granularity for your time series'))
|
||||
|
||||
|
|
@ -1160,14 +1140,18 @@ class NVD3TimeSeriesViz(NVD3Viz):
|
|||
df = df.pivot_table(
|
||||
index=DTTM_ALIAS,
|
||||
columns=fd.get('groupby'),
|
||||
values=self.metric_labels)
|
||||
values=self.metric_labels,
|
||||
dropna=False,
|
||||
)
|
||||
else:
|
||||
df = df.pivot_table(
|
||||
index=DTTM_ALIAS,
|
||||
columns=fd.get('groupby'),
|
||||
values=self.metric_labels,
|
||||
fill_value=0,
|
||||
aggfunc=sum)
|
||||
aggfunc=sum,
|
||||
dropna=False,
|
||||
)
|
||||
|
||||
fm = fd.get('resample_fillmethod')
|
||||
if not fm:
|
||||
|
|
@ -1176,8 +1160,6 @@ class NVD3TimeSeriesViz(NVD3Viz):
|
|||
rule = fd.get('resample_rule')
|
||||
if how and rule:
|
||||
df = df.resample(rule, how=how, fill_method=fm)
|
||||
if not fm:
|
||||
df = df.fillna(0)
|
||||
|
||||
if self.sort_series:
|
||||
dfs = df.sum()
|
||||
|
|
@ -1241,7 +1223,6 @@ class NVD3TimeSeriesViz(NVD3Viz):
|
|||
fd = self.form_data
|
||||
comparison_type = fd.get('comparison_type') or 'values'
|
||||
df = self.process_data(df)
|
||||
|
||||
if comparison_type == 'values':
|
||||
chart_data = self.to_series(df)
|
||||
for i, (label, df2) in enumerate(self._extra_chart_data):
|
||||
|
|
@ -1368,7 +1349,6 @@ class NVD3DualLineViz(NVD3Viz):
|
|||
|
||||
def get_data(self, df):
|
||||
fd = self.form_data
|
||||
df = df.fillna(0)
|
||||
|
||||
if self.form_data.get('granularity') == 'all':
|
||||
raise Exception(_('Pick a time granularity for your time series'))
|
||||
|
|
@ -1377,7 +1357,9 @@ class NVD3DualLineViz(NVD3Viz):
|
|||
metric_2 = utils.get_metric_name(fd.get('metric_2'))
|
||||
df = df.pivot_table(
|
||||
index=DTTM_ALIAS,
|
||||
values=[metric, metric_2])
|
||||
values=[metric, metric_2],
|
||||
dropna=False,
|
||||
)
|
||||
|
||||
chart_data = self.to_series(df)
|
||||
return chart_data
|
||||
|
|
@ -1425,7 +1407,9 @@ class NVD3TimePivotViz(NVD3TimeSeriesViz):
|
|||
df = df.pivot_table(
|
||||
index=DTTM_ALIAS,
|
||||
columns='series',
|
||||
values=utils.get_metric_name(fd.get('metric')))
|
||||
values=utils.get_metric_name(fd.get('metric')),
|
||||
dropna=False,
|
||||
)
|
||||
chart_data = self.to_series(df)
|
||||
for serie in chart_data:
|
||||
serie['rank'] = rank_lookup[serie['key']]
|
||||
|
|
@ -1462,7 +1446,9 @@ class DistributionPieViz(NVD3Viz):
|
|||
metric = self.metric_labels[0]
|
||||
df = df.pivot_table(
|
||||
index=self.groupby,
|
||||
values=[metric])
|
||||
values=[metric],
|
||||
dropna=False,
|
||||
)
|
||||
df.sort_values(by=metric, ascending=False, inplace=True)
|
||||
df = df.reset_index()
|
||||
df.columns = ['x', 'y']
|
||||
|
|
@ -1549,9 +1535,10 @@ class DistributionBarViz(DistributionPieViz):
|
|||
pt = df.pivot_table(
|
||||
index=self.groupby,
|
||||
columns=columns,
|
||||
values=metrics)
|
||||
values=metrics,
|
||||
dropna=False,
|
||||
)
|
||||
if fd.get('contribution'):
|
||||
pt = pt.fillna(0)
|
||||
pt = pt.T
|
||||
pt = (pt / pt.sum()).T
|
||||
pt = pt.reindex(row.index)
|
||||
|
|
@ -2117,9 +2104,6 @@ class BaseDeckGLViz(BaseViz):
|
|||
credits = '<a href="https://uber.github.io/deck.gl/">deck.gl</a>'
|
||||
spatial_control_keys = []
|
||||
|
||||
def handle_nulls(self, df):
|
||||
return df
|
||||
|
||||
def get_metrics(self):
|
||||
self.metric = self.form_data.get('size')
|
||||
return [self.metric] if self.metric else []
|
||||
|
|
@ -2572,11 +2556,11 @@ class PairedTTestViz(BaseViz):
|
|||
fd = self.form_data
|
||||
groups = fd.get('groupby')
|
||||
metrics = fd.get('metrics')
|
||||
df.fillna(0)
|
||||
df = df.pivot_table(
|
||||
index=DTTM_ALIAS,
|
||||
columns=groups,
|
||||
values=metrics)
|
||||
values=metrics,
|
||||
)
|
||||
cols = []
|
||||
# Be rid of falsey keys
|
||||
for col in df.columns:
|
||||
|
|
@ -2699,7 +2683,7 @@ class PartitionViz(NVD3TimeSeriesViz):
|
|||
for i in range(0, len(groups) + 1):
|
||||
self.form_data['groupby'] = groups[:i]
|
||||
df_drop = df.drop(groups[i:], 1)
|
||||
procs[i] = self.process_data(df_drop, aggregate=True).fillna(0)
|
||||
procs[i] = self.process_data(df_drop, aggregate=True)
|
||||
self.form_data['groupby'] = groups
|
||||
return procs
|
||||
|
||||
|
|
|
|||
|
|
@ -622,16 +622,6 @@ class CoreTests(SupersetTestCase):
|
|||
assert 'language' in resp
|
||||
self.logout()
|
||||
|
||||
def test_viz_get_fillna_for_columns(self):
|
||||
slc = self.get_slice('Girls', db.session)
|
||||
q = slc.viz.query_obj()
|
||||
results = slc.viz.datasource.query(q)
|
||||
fillna_columns = slc.viz.get_fillna_for_columns(results.df.columns)
|
||||
self.assertDictEqual(
|
||||
fillna_columns,
|
||||
{'name': ' NULL', 'sum__num': 0},
|
||||
)
|
||||
|
||||
def test_import_csv(self):
|
||||
self.login(username='admin')
|
||||
filename = 'testCSV.csv'
|
||||
|
|
|
|||
|
|
@ -84,18 +84,6 @@ class BaseVizTestCase(SupersetTestCase):
|
|||
self.assertEqual(test_viz.metric_labels, expect_metric_labels)
|
||||
self.assertEqual(test_viz.all_metrics, expect_metric_labels)
|
||||
|
||||
def test_get_fillna_returns_default_on_null_columns(self):
|
||||
form_data = {
|
||||
'viz_type': 'table',
|
||||
'token': '12345',
|
||||
}
|
||||
datasource = self.get_datasource_mock()
|
||||
test_viz = viz.BaseViz(datasource, form_data)
|
||||
self.assertEqual(
|
||||
test_viz.default_fillna,
|
||||
test_viz.get_fillna_for_columns(),
|
||||
)
|
||||
|
||||
def test_get_df_returns_empty_df(self):
|
||||
form_data = {'dummy': 123}
|
||||
query_obj = {'granularity': 'day'}
|
||||
|
|
|
|||
Loading…
Reference in New Issue