SQL Lab: Use numpy structured arrays, fallback to JSON serialization (#9096)

* Use numpy structured arrays, fallback to JSON serialization

* Explicitly cast data as list when creating numpy array
This commit is contained in:
Rob DiCiuccio 2020-02-06 17:25:22 -08:00 committed by GitHub
parent 8a138fbd03
commit 161d211c07
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 74 additions and 17 deletions

View File

@ -57,6 +57,15 @@ def dedup(l: List[str], suffix: str = "__", case_sensitive: bool = True) -> List
return new_l
def stringify(obj: Any) -> str:
    """Serialize *obj* to a JSON string.

    Falls back to the project's ISO-datetime serializer
    (``utils.json_iso_dttm_ser``) for values the stdlib encoder
    cannot handle natively.
    """
    serialized: str = json.dumps(obj, default=utils.json_iso_dttm_ser)
    return serialized
def stringify_values(array: np.ndarray) -> np.ndarray:
    """Apply :func:`stringify` element-wise over *array*.

    Returns a new array whose elements are the JSON-string
    representations of the originals.
    """
    return np.vectorize(stringify)(array)
class SupersetResultSet:
def __init__(
self,
@ -68,6 +77,8 @@ class SupersetResultSet:
column_names: List[str] = []
pa_data: List[pa.Array] = []
deduped_cursor_desc: List[Tuple[Any, ...]] = []
numpy_dtype: List[Tuple[str, ...]] = []
stringified_arr: np.ndarray
if cursor_description:
# get deduped list of column names
@ -79,33 +90,45 @@ class SupersetResultSet:
for column_name, description in zip(column_names, cursor_description)
]
# put data in a 2D array so we can efficiently access each column;
array = np.array(data, dtype="object")
if array.size > 0:
pa_data = [pa.array(array[:, i]) for i, column in enumerate(column_names)]
# generate numpy structured array dtype
numpy_dtype = [(column_name, "object") for column_name in column_names]
# put data in a structured array so we can efficiently access each column.
# cast `data` as list due to MySQL (others?) wrapping results with a tuple.
array = np.array(list(data), dtype=numpy_dtype)
if array.size > 0:
for column in column_names:
try:
pa_data.append(pa.array(array[column].tolist()))
except (
pa.lib.ArrowInvalid,
pa.lib.ArrowTypeError,
pa.lib.ArrowNotImplementedError,
):
# attempt serialization of values as strings
stringified_arr = stringify_values(array[column])
pa_data.append(pa.array(stringified_arr.tolist()))
# workaround for bug converting `psycopg2.tz.FixedOffsetTimezone` tzinfo values.
# related: https://issues.apache.org/jira/browse/ARROW-5248
if pa_data:
for i, column in enumerate(column_names):
# TODO: revisit nested column serialization once Arrow 1.0 is released with:
# https://github.com/apache/arrow/pull/6199
# Related issue: #8978
if pa.types.is_nested(pa_data[i].type):
stringify_func = lambda item: json.dumps(
item, default=utils.json_iso_dttm_ser
)
vfunc = np.vectorize(stringify_func)
strigified_arr = vfunc(array[:, i])
pa_data[i] = pa.array(strigified_arr)
# TODO: revisit nested column serialization once PyArrow updated with:
# https://github.com/apache/arrow/pull/6199
# Related issue: https://github.com/apache/incubator-superset/issues/8978
stringified_arr = stringify_values(array[column])
pa_data[i] = pa.array(stringified_arr.tolist())
elif pa.types.is_temporal(pa_data[i].type):
sample = self.first_nonempty(array[:, i])
# workaround for bug converting `psycopg2.tz.FixedOffsetTimezone` tzinfo values.
# related: https://issues.apache.org/jira/browse/ARROW-5248
sample = self.first_nonempty(array[column])
if sample and isinstance(sample, datetime.datetime):
try:
if sample.tzinfo:
tz = sample.tzinfo
series = pd.Series(array[:, i], dtype="datetime64[ns]")
series = pd.Series(
array[column], dtype="datetime64[ns]"
)
series = pd.to_datetime(series).dt.tz_localize(tz)
pa_data[i] = pa.Array.from_pandas(
series, type=pa.timestamp("ns", tz=tz)

View File

@ -166,6 +166,40 @@ class SupersetResultSetTestCase(SupersetTestCase):
],
)
def test_single_column_multidim_nested_types(self):
    """A single column holding deeply nested lists should be typed as
    STRING and round-trip to its JSON string representation."""
    # One row, one column; the value mixes strings, ints, multi-level
    # nested lists, and trailing None entries.
    nested_rows = [
        (
            [
                "test",
                [
                    [
                        "foo",
                        123456,
                        [
                            [["test"], 3432546, 7657658766],
                            [["fake"], 656756765, 324324324324],
                        ],
                    ]
                ],
                ["test2", 43, 765765765],
                None,
                None,
            ],
        )
    ]
    description = [("metadata",)]
    result_set = SupersetResultSet(nested_rows, description, BaseEngineSpec)
    # Nested values cannot be represented natively, so the column is
    # serialized to a JSON string.
    self.assertEqual(result_set.columns[0]["type"], "STRING")
    records = df_to_records(result_set.to_pandas_df())
    self.assertEqual(
        records,
        [
            {
                "metadata": '["test", [["foo", 123456, [[["test"], 3432546, 7657658766], [["fake"], 656756765, 324324324324]]]], ["test2", 43, 765765765], null, null]'
            }
        ],
    )
def test_empty_datetime(self):
data = [(None,)]
cursor_descr = [("ds", "timestamp", None, None, None, None, True)]