From 1f8e48b374adb091bf41e60634e99b132a37bf62 Mon Sep 17 00:00:00 2001 From: Maxime Beauchemin Date: Fri, 24 Mar 2017 09:23:51 -0700 Subject: [PATCH] [sqllab] assign types for visualize flow (#2458) * [sqllab] assign types for visualize flow Somehow when using the visualize flow, the types were not assigned at all, creating some bugs downstream. This PR attempts to get the information required based on what pandas knows and the types in the data itself. * Fixing tests * Fixing tests * Fixing more tests * Fixing the last py3 tests --- setup.py | 1 + superset/dataframe.py | 128 +++++++++++++++++++++++++---------------- superset/views/core.py | 1 + tests/celery_tests.py | 89 ++++++++++++++++------------ 4 files changed, 135 insertions(+), 84 deletions(-) diff --git a/setup.py b/setup.py index 2da3f3254..033d9ac83 100644 --- a/setup.py +++ b/setup.py @@ -51,6 +51,7 @@ setup( 'flask-script==2.0.5', 'flask-sqlalchemy==2.0', 'flask-testing==0.6.1', + 'future>=0.16.0, <0.17', 'humanize==0.5.1', 'gunicorn==19.6.0', 'markdown==2.6.8', diff --git a/superset/dataframe.py b/superset/dataframe.py index 9f7aa88b8..f3b9f3e1b 100644 --- a/superset/dataframe.py +++ b/superset/dataframe.py @@ -10,6 +10,9 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals +from datetime import datetime, date +from past.builtins import basestring + import pandas as pd import numpy as np @@ -19,6 +22,22 @@ INFER_COL_TYPES_SAMPLE_SIZE = 100 class SupersetDataFrame(object): + # Mapping numpy dtype.char to generic database types + type_map = { + 'b': 'BOOL', # boolean + 'i': 'INT', # (signed) integer + 'u': 'INT', # unsigned integer + 'l': 'INT', # 64bit integer + 'f': 'FLOAT', # floating-point + 'c': 'FLOAT', # complex-floating point + 'm': None, # timedelta + 'M': 'DATETIME', # datetime + 'O': 'OBJECT', # (Python) objects + 'S': 'BYTE', # (byte-)string + 'U': 'STRING', # Unicode + 'V': None, # raw data (void) + } + def __init__(self, df): 
self.__df = df.where((pd.notnull(df)), None) @@ -30,6 +49,47 @@ class SupersetDataFrame(object): def data(self): return self.__df.to_dict(orient='records') + @classmethod + def db_type(cls, dtype): + """Given a numpy dtype, Returns a generic database type""" + return cls.type_map.get(dtype.char) + + @classmethod + def datetime_conversion_rate(cls, data_series): + success = 0 + total = 0 + for value in data_series: + total += 1 + try: + pd.to_datetime(value) + success += 1 + except Exception: + continue + return 100 * success / total + + @classmethod + def is_date(cls, dtype): + if dtype.name: + return dtype.name.startswith('datetime') + + @classmethod + def is_dimension(cls, dtype, column_name): + if cls.is_id(column_name): + return False + return dtype.name in ('object', 'bool') + + @classmethod + def is_id(cls, column_name): + return column_name.startswith('id') or column_name.endswith('id') + + @classmethod + def agg_func(cls, dtype, column_name): + # consider checking for key substring too. + if cls.is_id(column_name): + return 'count_distinct' + if np.issubdtype(dtype, np.number): + return 'sum' + @property def columns(self): """Provides metadata about columns for data visualization. 
@@ -45,22 +105,33 @@ class SupersetDataFrame(object): if sample_size: sample = self.__df.sample(sample_size) for col in self.__df.dtypes.keys(): + col_db_type = self.db_type(self.__df.dtypes[col]) column = { 'name': col, - 'type': self.__df.dtypes[col].name, - 'is_date': is_date(self.__df.dtypes[col]), - 'is_dim': is_dimension(self.__df.dtypes[col], col), + 'agg': self.agg_func(self.__df.dtypes[col], col), + 'type': col_db_type, + 'is_date': self.is_date(self.__df.dtypes[col]), + 'is_dim': self.is_dimension(self.__df.dtypes[col], col), } - agg = agg_func(self.__df.dtypes[col], col) - if agg_func: - column['agg'] = agg - if column['type'] == 'object': + if column['type'] in ('OBJECT', None): + v = sample[col].iloc[0] if not sample[col].empty else None + if isinstance(v, basestring): + column['type'] = 'STRING' + elif isinstance(v, int): + column['type'] = 'INT' + elif isinstance(v, float): + column['type'] = 'FLOAT' + elif isinstance(v, (datetime, date)): + column['type'] = 'DATETIME' + column['is_date'] = True + column['is_dim'] = False # check if encoded datetime - if (datetime_conversion_rate(sample[col]) > + if ( + column['type'] == 'STRING' and + self.datetime_conversion_rate(sample[col]) > INFER_COL_TYPES_THRESHOLD): column.update({ - 'type': 'datetime_string', 'is_date': True, 'is_dim': False, 'agg': None @@ -70,42 +141,3 @@ class SupersetDataFrame(object): column.pop('agg', None) columns.append(column) return columns - - -# It will give false positives on the numbers that are stored as strings. 
-# It is hard to distinguish integer numbers and timestamps -def datetime_conversion_rate(data_series): - success = 0 - total = 0 - for value in data_series: - total += 1 - try: - pd.to_datetime(value) - success += 1 - except Exception: - continue - return 100 * success / total - - -def is_date(dtype): - if dtype.name: - return dtype.name.startswith('datetime') - - -def is_dimension(dtype, column_name): - if is_id(column_name): - return False - return dtype.name in ('object', 'bool') - - -def is_id(column_name): - return column_name.startswith('id') or column_name.endswith('id') - - -def agg_func(dtype, column_name): - # consider checking for key substring too. - if is_id(column_name): - return 'count_distinct' - if np.issubdtype(dtype, np.number): - return 'sum' - return None diff --git a/superset/views/core.py b/superset/views/core.py index f077e481c..a3b7d110b 100755 --- a/superset/views/core.py +++ b/superset/views/core.py @@ -1781,6 +1781,7 @@ class Superset(BaseSupersetView): filterable=is_dim, groupby=is_dim, is_dttm=config.get('is_date', False), + type=config.get('type', False), ) cols.append(col) if is_dim: diff --git a/tests/celery_tests.py b/tests/celery_tests.py index 8da39be96..43e1b6f29 100644 --- a/tests/celery_tests.py +++ b/tests/celery_tests.py @@ -9,6 +9,7 @@ import os import subprocess import time import unittest +from past.builtins import basestring import pandas as pd @@ -238,49 +239,65 @@ class CeleryTestCase(SupersetTestCase): self.assertEqual(True, query.select_as_cta) self.assertEqual(True, query.select_as_cta_used) + @staticmethod + def de_unicode_dict(d): + def str_if_basestring(o): + if isinstance(o, basestring): + return str(o) + return o + return {str_if_basestring(k): str_if_basestring(d[k]) for k in d} + + @classmethod + def dictify_list_of_dicts(cls, l, k): + return {str(o[k]): cls.de_unicode_dict(o) for o in l} + def test_get_columns(self): main_db = self.get_main_database(db.session) df = main_db.get_df("SELECT * FROM 
multiformat_time_series", None) cdf = dataframe.SupersetDataFrame(df) + + # Making ordering non-deterministic + cols = self.dictify_list_of_dicts(cdf.columns, 'name') + if main_db.sqlalchemy_uri.startswith('sqlite'): - self.assertEqual( - [{'is_date': True, 'type': 'datetime_string', 'name': 'ds', - 'is_dim': False}, - {'is_date': True, 'type': 'datetime_string', 'name': 'ds2', - 'is_dim': False}, - {'agg': 'sum', 'is_date': False, 'type': 'int64', - 'name': 'epoch_ms', 'is_dim': False}, - {'agg': 'sum', 'is_date': False, 'type': 'int64', - 'name': 'epoch_s', 'is_dim': False}, - {'is_date': True, 'type': 'datetime_string', 'name': 'string0', - 'is_dim': False}, - {'is_date': False, 'type': 'object', - 'name': 'string1', 'is_dim': True}, - {'is_date': True, 'type': 'datetime_string', 'name': 'string2', - 'is_dim': False}, - {'is_date': False, 'type': 'object', - 'name': 'string3', 'is_dim': True}] - , cdf.columns + self.assertEqual(self.dictify_list_of_dicts([ + {'is_date': True, 'type': 'STRING', 'name': 'ds', + 'is_dim': False}, + {'is_date': True, 'type': 'STRING', 'name': 'ds2', + 'is_dim': False}, + {'agg': 'sum', 'is_date': False, 'type': 'INT', + 'name': 'epoch_ms', 'is_dim': False}, + {'agg': 'sum', 'is_date': False, 'type': 'INT', + 'name': 'epoch_s', 'is_dim': False}, + {'is_date': True, 'type': 'STRING', 'name': 'string0', + 'is_dim': False}, + {'is_date': False, 'type': 'STRING', + 'name': 'string1', 'is_dim': True}, + {'is_date': True, 'type': 'STRING', 'name': 'string2', + 'is_dim': False}, + {'is_date': False, 'type': 'STRING', + 'name': 'string3', 'is_dim': True}], 'name') + , cols ) else: - self.assertEqual( - [{'is_date': True, 'type': 'datetime_string', 'name': 'ds', - 'is_dim': False}, - {'is_date': True, 'type': 'datetime64[ns]', - 'name': 'ds2', 'is_dim': False}, - {'agg': 'sum', 'is_date': False, 'type': 'int64', - 'name': 'epoch_ms', 'is_dim': False}, - {'agg': 'sum', 'is_date': False, 'type': 'int64', - 'name': 'epoch_s', 'is_dim': False}, - 
{'is_date': True, 'type': 'datetime_string', 'name': 'string0', - 'is_dim': False}, - {'is_date': False, 'type': 'object', - 'name': 'string1', 'is_dim': True}, - {'is_date': True, 'type': 'datetime_string', 'name': 'string2', - 'is_dim': False}, - {'is_date': False, 'type': 'object', - 'name': 'string3', 'is_dim': True}] - , cdf.columns + self.assertEqual(self.dictify_list_of_dicts([ + {'is_date': True, 'type': 'DATETIME', 'name': 'ds', + 'is_dim': False}, + {'is_date': True, 'type': 'DATETIME', + 'name': 'ds2', 'is_dim': False}, + {'agg': 'sum', 'is_date': False, 'type': 'INT', + 'name': 'epoch_ms', 'is_dim': False}, + {'agg': 'sum', 'is_date': False, 'type': 'INT', + 'name': 'epoch_s', 'is_dim': False}, + {'is_date': True, 'type': 'STRING', 'name': 'string0', + 'is_dim': False}, + {'is_date': False, 'type': 'STRING', + 'name': 'string1', 'is_dim': True}, + {'is_date': True, 'type': 'STRING', 'name': 'string2', + 'is_dim': False}, + {'is_date': False, 'type': 'STRING', + 'name': 'string3', 'is_dim': True}], 'name') + , cols )