[sqllab] assign types for visualize flow (#2458)

* [sqllab] assign types for visualize flow

Somehow when using the visualize flow, the types were not
assigned at all, creating some bugs downstream. This PR attempts to get
the information required based on what pandas knows and the types in
the data itself.

* Fixing tests

* Fixing tests

* Fixing more tests

* Fixing the last py3 tests
This commit is contained in:
Maxime Beauchemin 2017-03-24 09:23:51 -07:00 committed by GitHub
parent 7bf19b1232
commit 1f8e48b374
4 changed files with 135 additions and 84 deletions

View File

@ -51,6 +51,7 @@ setup(
'flask-script==2.0.5',
'flask-sqlalchemy==2.0',
'flask-testing==0.6.1',
'future>=0.16.0, <0.17',
'humanize==0.5.1',
'gunicorn==19.6.0',
'markdown==2.6.8',

View File

@ -10,6 +10,9 @@ from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from datetime import datetime, date
from past.builtins import basestring
import pandas as pd
import numpy as np
@ -19,6 +22,22 @@ INFER_COL_TYPES_SAMPLE_SIZE = 100
class SupersetDataFrame(object):
# Mapping numpy dtype.char to generic database types
type_map = {
'b': 'BOOL', # boolean
'i': 'INT', # (signed) integer
'u': 'INT', # unsigned integer
'l': 'INT', # 64bit integer
'f': 'FLOAT', # floating-point
'c': 'FLOAT', # complex-floating point
'm': None, # timedelta
'M': 'DATETIME', # datetime
'O': 'OBJECT', # (Python) objects
'S': 'BYTE', # (byte-)string
'U': 'STRING', # Unicode
'V': None, # raw data (void)
}
def __init__(self, df):
self.__df = df.where((pd.notnull(df)), None)
@ -30,6 +49,47 @@ class SupersetDataFrame(object):
def data(self):
return self.__df.to_dict(orient='records')
@classmethod
def db_type(cls, dtype):
    """Given a numpy dtype, return a generic database type.

    The lookup is keyed on ``dtype.kind`` (the one-letter type family),
    which is what the keys of ``type_map`` describe.  ``dtype.char`` is
    width specific (float64 is 'd', bool is '?', int8 is 'b'), so keying
    on it missed the map for common dtypes or hit the wrong entry.

    :param dtype: a ``numpy.dtype`` instance
    :returns: the mapped type name, or None when the kind is unmapped
        or explicitly mapped to None
    """
    return cls.type_map.get(dtype.kind)
@classmethod
def datetime_conversion_rate(cls, data_series):
    """Return the percentage (0-100) of values parseable as datetimes.

    Each value is passed to ``pd.to_datetime`` on its own; a value counts
    as a success when that call does not raise.  An empty series yields 0
    instead of raising ZeroDivisionError.

    :param data_series: any iterable of values (typically a column sample)
    :returns: ``100 * successes / total``, or 0 when there are no values
    """
    success = 0
    total = 0
    for value in data_series:
        total += 1
        try:
            pd.to_datetime(value)
        except Exception:
            # Best effort on purpose: any parsing failure simply means
            # "this value is not a datetime".
            continue
        success += 1
    if not total:
        return 0
    return 100 * success / total
@classmethod
def is_date(cls, dtype):
    """Tell whether the dtype's name starts with 'datetime'.

    Returns None (not False) when the dtype has an empty name.
    """
    name = dtype.name
    if not name:
        return None
    return name.startswith('datetime')
@classmethod
def is_dimension(cls, dtype, column_name):
    """Tell whether a column should be treated as a dimension.

    Columns flagged by ``is_id`` are never dimensions; otherwise only
    dtypes named 'object' or 'bool' qualify.
    """
    return (
        not cls.is_id(column_name) and
        dtype.name in ('object', 'bool')
    )
@classmethod
def is_id(cls, column_name):
    """Tell whether the column name begins or ends with 'id'.

    This is a plain prefix/suffix check on the name.
    """
    has_id_prefix = column_name.startswith('id')
    return has_id_prefix or column_name.endswith('id')
@classmethod
def agg_func(cls, dtype, column_name):
    """Pick a default aggregate for a column.

    Returns 'count_distinct' for columns flagged by ``is_id``, 'sum' for
    numeric dtypes, and None for everything else.
    """
    # consider checking for key substring too.
    if cls.is_id(column_name):
        agg = 'count_distinct'
    elif np.issubdtype(dtype, np.number):
        agg = 'sum'
    else:
        agg = None
    return agg
@property
def columns(self):
"""Provides metadata about columns for data visualization.
@ -45,22 +105,33 @@ class SupersetDataFrame(object):
if sample_size:
sample = self.__df.sample(sample_size)
for col in self.__df.dtypes.keys():
col_db_type = self.db_type(self.__df.dtypes[col])
column = {
'name': col,
'type': self.__df.dtypes[col].name,
'is_date': is_date(self.__df.dtypes[col]),
'is_dim': is_dimension(self.__df.dtypes[col], col),
'agg': self.agg_func(self.__df.dtypes[col], col),
'type': col_db_type,
'is_date': self.is_date(self.__df.dtypes[col]),
'is_dim': self.is_dimension(self.__df.dtypes[col], col),
}
agg = agg_func(self.__df.dtypes[col], col)
if agg_func:
column['agg'] = agg
if column['type'] == 'object':
if column['type'] in ('OBJECT', None):
v = sample[col].iloc[0] if not sample[col].empty else None
if isinstance(v, basestring):
column['type'] = 'STRING'
elif isinstance(v, int):
column['type'] = 'INT'
elif isinstance(v, float):
column['type'] = 'FLOAT'
elif isinstance(v, (datetime, date)):
column['type'] = 'DATETIME'
column['is_date'] = True
column['is_dim'] = False
# check if encoded datetime
if (datetime_conversion_rate(sample[col]) >
if (
column['type'] == 'STRING' and
self.datetime_conversion_rate(sample[col]) >
INFER_COL_TYPES_THRESHOLD):
column.update({
'type': 'datetime_string',
'is_date': True,
'is_dim': False,
'agg': None
@ -70,42 +141,3 @@ class SupersetDataFrame(object):
column.pop('agg', None)
columns.append(column)
return columns
# It will give false positives on the numbers that are stored as strings.
# It is hard to distinguish integer numbers and timestamps
def datetime_conversion_rate(data_series):
    """Return the percentage (0-100) of values parseable as datetimes.

    Every value is handed to ``pd.to_datetime`` individually and counts as
    a success when the call does not raise.  An empty series returns 0
    rather than failing with ZeroDivisionError.

    :param data_series: any iterable of values
    :returns: ``100 * successes / total``, or 0 for an empty input
    """
    success = 0
    total = 0
    for value in data_series:
        total += 1
        try:
            pd.to_datetime(value)
        except Exception:
            # Intentionally broad: a failure of any kind means the value
            # is not datetime-like.
            continue
        success += 1
    if total == 0:
        return 0
    return 100 * success / total
def is_date(dtype):
    """Tell whether the dtype's name starts with 'datetime'.

    Yields None when the dtype name is empty.
    """
    return dtype.name.startswith('datetime') if dtype.name else None
def is_dimension(dtype, column_name):
    """Tell whether a column should be treated as a dimension.

    Id-like columns (per ``is_id``) never are; other columns qualify only
    when the dtype is named 'object' or 'bool'.
    """
    if not is_id(column_name):
        return dtype.name in ('object', 'bool')
    return False
def is_id(column_name):
    """Tell whether the column name begins or ends with 'id'."""
    if column_name.startswith('id'):
        return True
    return column_name.endswith('id')
def agg_func(dtype, column_name):
    """Pick a default aggregate for a column.

    'count_distinct' for id-like column names, 'sum' for numeric dtypes,
    None otherwise.
    """
    # consider checking for key substring too.
    if is_id(column_name):
        chosen = 'count_distinct'
    elif np.issubdtype(dtype, np.number):
        chosen = 'sum'
    else:
        chosen = None
    return chosen

View File

@ -1781,6 +1781,7 @@ class Superset(BaseSupersetView):
filterable=is_dim,
groupby=is_dim,
is_dttm=config.get('is_date', False),
type=config.get('type', False),
)
cols.append(col)
if is_dim:

View File

@ -9,6 +9,7 @@ import os
import subprocess
import time
import unittest
from past.builtins import basestring
import pandas as pd
@ -238,49 +239,65 @@ class CeleryTestCase(SupersetTestCase):
self.assertEqual(True, query.select_as_cta)
self.assertEqual(True, query.select_as_cta_used)
@staticmethod
def de_unicode_dict(d):
    """Copy a dict, passing every string key and value through ``str``.

    Keys and values that are not ``basestring`` instances are kept as-is.
    """
    def str_if_basestring(o):
        return str(o) if isinstance(o, basestring) else o

    converted = {}
    for key in d:
        converted[str_if_basestring(key)] = str_if_basestring(d[key])
    return converted
@classmethod
def dictify_list_of_dicts(cls, l, k):
    """Index a list of dicts by the ``str`` of each dict's value at ``k``.

    Each dict is passed through ``de_unicode_dict`` before being stored,
    so the result does not depend on the order of ``l``.
    """
    indexed = {}
    for entry in l:
        key = str(entry[k])
        indexed[key] = cls.de_unicode_dict(entry)
    return indexed
def test_get_columns(self):
main_db = self.get_main_database(db.session)
df = main_db.get_df("SELECT * FROM multiformat_time_series", None)
cdf = dataframe.SupersetDataFrame(df)
# Making ordering non-deterministic
cols = self.dictify_list_of_dicts(cdf.columns, 'name')
if main_db.sqlalchemy_uri.startswith('sqlite'):
self.assertEqual(
[{'is_date': True, 'type': 'datetime_string', 'name': 'ds',
'is_dim': False},
{'is_date': True, 'type': 'datetime_string', 'name': 'ds2',
'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'int64',
'name': 'epoch_ms', 'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'int64',
'name': 'epoch_s', 'is_dim': False},
{'is_date': True, 'type': 'datetime_string', 'name': 'string0',
'is_dim': False},
{'is_date': False, 'type': 'object',
'name': 'string1', 'is_dim': True},
{'is_date': True, 'type': 'datetime_string', 'name': 'string2',
'is_dim': False},
{'is_date': False, 'type': 'object',
'name': 'string3', 'is_dim': True}]
, cdf.columns
self.assertEqual(self.dictify_list_of_dicts([
{'is_date': True, 'type': 'STRING', 'name': 'ds',
'is_dim': False},
{'is_date': True, 'type': 'STRING', 'name': 'ds2',
'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'INT',
'name': 'epoch_ms', 'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'INT',
'name': 'epoch_s', 'is_dim': False},
{'is_date': True, 'type': 'STRING', 'name': 'string0',
'is_dim': False},
{'is_date': False, 'type': 'STRING',
'name': 'string1', 'is_dim': True},
{'is_date': True, 'type': 'STRING', 'name': 'string2',
'is_dim': False},
{'is_date': False, 'type': 'STRING',
'name': 'string3', 'is_dim': True}], 'name')
, cols
)
else:
self.assertEqual(
[{'is_date': True, 'type': 'datetime_string', 'name': 'ds',
'is_dim': False},
{'is_date': True, 'type': 'datetime64[ns]',
'name': 'ds2', 'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'int64',
'name': 'epoch_ms', 'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'int64',
'name': 'epoch_s', 'is_dim': False},
{'is_date': True, 'type': 'datetime_string', 'name': 'string0',
'is_dim': False},
{'is_date': False, 'type': 'object',
'name': 'string1', 'is_dim': True},
{'is_date': True, 'type': 'datetime_string', 'name': 'string2',
'is_dim': False},
{'is_date': False, 'type': 'object',
'name': 'string3', 'is_dim': True}]
, cdf.columns
self.assertEqual(self.dictify_list_of_dicts([
{'is_date': True, 'type': 'DATETIME', 'name': 'ds',
'is_dim': False},
{'is_date': True, 'type': 'DATETIME',
'name': 'ds2', 'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'INT',
'name': 'epoch_ms', 'is_dim': False},
{'agg': 'sum', 'is_date': False, 'type': 'INT',
'name': 'epoch_s', 'is_dim': False},
{'is_date': True, 'type': 'STRING', 'name': 'string0',
'is_dim': False},
{'is_date': False, 'type': 'STRING',
'name': 'string1', 'is_dim': True},
{'is_date': True, 'type': 'STRING', 'name': 'string2',
'is_dim': False},
{'is_date': False, 'type': 'STRING',
'name': 'string3', 'is_dim': True}], 'name')
, cols
)